Re: [PATCH 3/4] iommu: remove the put_resv_regions method
On 2022/7/8 17:33, Christoph Hellwig wrote: On Fri, Jul 08, 2022 at 05:00:59PM +0800, Baolu Lu wrote: Do we really need to export this symbol? It is not used beyond the iommu core code. virtio-iommu calls it and can be modular. Yes. Thanks for the explanation. Reviewed-by: Lu Baolu Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/4] iommu: remove the unused dev_has_feat method
On 2022/7/8 16:06, Christoph Hellwig wrote: This method is never actually called. Signed-off-by: Christoph Hellwig Reviewed-by: Lu Baolu Best regards, baolu --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 - include/linux/iommu.h | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index d9c1623ec1a9a..1b6c17dd81ee4 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2853,7 +2853,6 @@ static struct iommu_ops arm_smmu_ops = { .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, .put_resv_regions = generic_iommu_put_resv_regions, - .dev_has_feat = arm_smmu_dev_has_feature, .dev_feat_enabled = arm_smmu_dev_feature_enabled, .dev_enable_feat= arm_smmu_dev_enable_feature, .dev_disable_feat = arm_smmu_dev_disable_feature, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e6abd998dbe73..a3acdb46b9391 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -164,8 +164,7 @@ struct iommu_iort_rmr_data { * supported, this feature must be enabled before and * disabled after %IOMMU_DEV_FEAT_SVA. * - * Device drivers query whether a feature is supported using - * iommu_dev_has_feature(), and enable it using iommu_dev_enable_feature(). + * Device drivers enable a feature using iommu_dev_enable_feature(). */ enum iommu_dev_features { IOMMU_DEV_FEAT_SVA, @@ -248,7 +247,6 @@ struct iommu_ops { bool (*is_attach_deferred)(struct device *dev); /* Per device IOMMU features */ - bool (*dev_has_feat)(struct device *dev, enum iommu_dev_features f); bool (*dev_feat_enabled)(struct device *dev, enum iommu_dev_features f); int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/4] iommu: remove iommu_dev_feature_enabled
On 2022/7/8 16:06, Christoph Hellwig wrote: Remove the unused iommu_dev_feature_enabled function. Signed-off-by: Christoph Hellwig A nit comment below, anyway Reviewed-by: Lu Baolu --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 - drivers/iommu/iommu.c | 13 - include/linux/iommu.h | 9 - 3 files changed, 23 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 1b6c17dd81ee4..4d30a8d2bc236 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2853,7 +2853,6 @@ static struct iommu_ops arm_smmu_ops = { .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, .put_resv_regions = generic_iommu_put_resv_regions, - .dev_feat_enabled = arm_smmu_dev_feature_enabled, .dev_enable_feat= arm_smmu_dev_enable_feature, .dev_disable_feat = arm_smmu_dev_disable_feature, .sva_bind = arm_smmu_sva_bind, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 0aa141646bdf4..1bb016a6a2aa1 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2760,19 +2760,6 @@ int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) } EXPORT_SYMBOL_GPL(iommu_dev_disable_feature); -bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat) -{ - if (dev->iommu && dev->iommu->iommu_dev) { - const struct iommu_ops *ops = dev->iommu->iommu_dev->ops; - - if (ops->dev_feat_enabled) - return ops->dev_feat_enabled(dev, feat); - } - - return false; -} -EXPORT_SYMBOL_GPL(iommu_dev_feature_enabled); - /** * iommu_sva_bind_device() - Bind a process address space to a device * @dev: the device diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a3acdb46b9391..0bc2eb14b0262 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -215,7 +215,6 @@ struct iommu_iotlb_gather { * driver init to device driver init (default no) * @dev_has/enable/disable_feat: per device entries to 
check/enable/disable ^^^ dev_has_feat() has been removed by the previous patch. * iommu specific features. - * @dev_feat_enabled: check enabled feature * @sva_bind: Bind process address space to device * @sva_unbind: Unbind process address space from device * @sva_get_pasid: Get PASID associated to a SVA handle @@ -247,7 +246,6 @@ struct iommu_ops { bool (*is_attach_deferred)(struct device *dev); /* Per device IOMMU features */ - bool (*dev_feat_enabled)(struct device *dev, enum iommu_dev_features f); int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); @@ -670,7 +668,6 @@ void iommu_release_device(struct device *dev); int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f); int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features f); -bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features f); struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, @@ -997,12 +994,6 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) return NULL; } -static inline bool -iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features feat) -{ - return false; -} - static inline int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat) { Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 3/4] iommu: remove the put_resv_regions method
On 2022/7/8 16:06, Christoph Hellwig wrote: -void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list) +void iommu_put_resv_regions(struct device *dev, struct list_head *list) { struct iommu_resv_region *entry, *next; @@ -2610,7 +2597,7 @@ void generic_iommu_put_resv_regions(struct device *dev, struct list_head *list) kfree(entry); } } -EXPORT_SYMBOL(generic_iommu_put_resv_regions); +EXPORT_SYMBOL(iommu_put_resv_regions); Do we really need to export this symbol? It is not used beyond the iommu core code. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v4 01/11] iommu/vt-d: debugfs: Remove device_domain_lock usage
On 2022/7/7 16:30, Ethan Zhao wrote: -static int show_device_domain_translation(struct device *dev, void *data) +static int __show_device_domain_translation(struct device *dev, void *data) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *domain = info->domain; + struct dmar_domain *domain; struct seq_file *m = data; u64 path[6] = { 0 }; + domain = to_dmar_domain(iommu_get_domain_for_dev(dev)); if (!domain) return 0; @@ -359,20 +359,39 @@ static int show_device_domain_translation(struct device *dev, void *data) pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path); seq_putc(m, '\n'); - return 0; + /* Don't iterate */ + return 1; } Using this return value trick to change the caller behaviour, seems not saving anything, but really cost me a few seconds more to know the *incantation* -- 'Don't iterate' :) . This is defined by iommu_group_for_each_dev(). Return value 0 means continuing to next one, while non-zero means stopping iteration. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 3/6] iommu/vt-d: Refactor iommu information of each domain
On 2022/7/7 09:01, Tian, Kevin wrote: From: Lu Baolu Sent: Saturday, July 2, 2022 9:56 AM -out_unlock: + set_bit(num, iommu->domain_ids); + info->refcnt = 1; + info->did= num; + info->iommu = iommu; + domain->nid = iommu->node; One nit. this line should be removed as it's incorrect to blindly update domain->nid and we should just leave to domain_update_iommu_cap() to decide the right node. Otherwise this causes a policy conflict as here it is the last attached device deciding the node which is different from domain_update_iommu_cap() which picks the node of the first attached device. Agreed and updated. Thank you! Otherwise, Reviewed-by: Kevin Tian Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 03/14] iommu: Move bus setup to IOMMU device registration
Hi Robin, On 2022/4/30 02:06, Robin Murphy wrote: On 29/04/2022 9:50 am, Robin Murphy wrote: On 2022-04-29 07:57, Baolu Lu wrote: Hi Robin, On 2022/4/28 21:18, Robin Murphy wrote: Move the bus setup to iommu_device_register(). This should allow bus_iommu_probe() to be correctly replayed for multiple IOMMU instances, and leaves bus_set_iommu() as a glorified no-op to be cleaned up next. I re-fetched the latest patches on https://gitlab.arm.com/linux-arm/linux-rm/-/commits/iommu/bus and rolled back the head to "iommu: Cleanup bus_set_iommu". The test machine still hangs during boot. I went through the code. It seems that the .probe_device for Intel IOMMU driver can't handle the probe replay well. It always assumes that the device has never been probed. Hmm, but probe_iommu_group() is supposed to prevent the __iommu_probe_device() call even happening if the device *has* already been probed before :/ I've still got an old Intel box spare in the office so I'll rig that up and see if I can see what might be going on here... OK, on a Xeon with two DMAR units, this seems to boot OK with or without patch #1, so it doesn't seem to be a general problem with replaying in iommu_device_register(), or with platform devices. Not sure where to go from here... 
:/ The kernel boot panic message: [6.639432] BUG: kernel NULL pointer dereference, address: 0028 [6.743829] #PF: supervisor read access in kernel mode [6.743829] #PF: error_code(0x) - not-present page [6.743829] PGD 0 [6.743829] Oops: [#1] PREEMPT SMP NOPTI [6.743829] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G I 5.19.0-rc3+ #182 [6.743829] RIP: 0010:__iommu_probe_device+0x115/0x200 [6.743829] Code: 89 ff e8 3e e1 ff ff 48 85 c0 0f 85 47 ff ff ff 41 bd f4 ff ff ff eb 83 48 8b 83 d8 02 00 00 48 89 df 48 8b 40 38 48 8b 40 10 <48> 8b 40 28 ff d0 0f 1f 00 48 89 c1 48 85 c0 0f 84 b7 00 00 00 48 [6.743829] RSP: :ff30605c00063d40 EFLAGS: 00010246 [6.743829] RAX: RBX: ff128b9c5fdc90d0 RCX: [6.743829] RDX: 8001 RSI: 0246 RDI: ff128b9c5fdc90d0 [6.743829] RBP: b60c34e0 R08: b68664d0 R09: ff128b9501d4ce40 [6.743829] R10: b6267096 R11: ff128b950014c267 R12: ff30605c00063de0 [6.743829] R13: 001b9d28 R14: ff128b95001b9d28 R15: ff128b9c5fdc93a8 [6.743829] FS: () GS:ff128b9c5fc0() knlGS: [6.743829] CS: 0010 DS: ES: CR0: 80050033 [6.743829] CR2: 0028 CR3: 000149210001 CR4: 00771ef0 [6.743829] DR0: DR1: DR2: [6.743829] DR3: DR6: 07f0 DR7: 0400 [6.743829] PKRU: 5554 [6.743829] Call Trace: [6.743829] [6.743829] ? _raw_spin_lock_irqsave+0x17/0x40 [6.743829] ? __iommu_probe_device+0x200/0x200 [6.743829] probe_iommu_group+0x2d/0x40 [6.743829] bus_for_each_dev+0x74/0xc0 [6.743829] bus_iommu_probe+0x48/0x2d0 [6.743829] iommu_device_register+0xde/0x120 [6.743829] intel_iommu_init+0x35f/0x5f2 [6.743829] ? iommu_setup+0x27d/0x27d [6.743829] ? rdinit_setup+0x2c/0x2c [6.743829] pci_iommu_init+0xe/0x32 [6.743829] do_one_initcall+0x41/0x200 [6.743829] kernel_init_freeable+0x1de/0x228 [6.743829] ? 
rest_init+0xc0/0xc0 [6.743829] kernel_init+0x16/0x120 [6.743829] ret_from_fork+0x1f/0x30 [6.743829] [6.743829] Modules linked in: [6.743829] CR2: 0028 [6.743829] ---[ end trace ]--- [6.743829] RIP: 0010:__iommu_probe_device+0x115/0x200 [6.743829] Code: 89 ff e8 3e e1 ff ff 48 85 c0 0f 85 47 ff ff ff 41 bd f4 ff ff ff eb 83 48 8b 83 d8 02 00 00 48 89 df 48 8b 40 38 48 8b 40 10 <48> 8b 40 28 ff d0 0f 1f 00 48 89 c1 48 85 c0 0f 84 b7 00 00 00 48 [6.743829] RSP: :ff30605c00063d40 EFLAGS: 00010246 [6.743829] RAX: RBX: ff128b9c5fdc90d0 RCX: [6.743829] RDX: 8001 RSI: 0246 RDI: ff128b9c5fdc90d0 [6.743829] RBP: b60c34e0 R08: b68664d0 R09: ff128b9501d4ce40 [6.743829] R10: b6267096 R11: ff128b950014c267 R12: ff30605c00063de0 [6.743829] R13: 001b9d28 R14: ff128b95001b9d28 R15: ff128b9c5fdc93a8 [6.743829] FS: () GS:ff128b9c5fc0() knlGS: [6.743829] CS: 0010 DS: ES: CR0: 80050033 [6.743829] CR2: 0028 CR3: 000149210001 CR4: 00771ef0 [6.743829] DR0: DR1: DR2: [6.743829] DR3: DR6: 07f0 DR7: 0400 [6.743829] PKRU: 5554 [6.743829] Kernel panic - not syncing: Fatal exception [6.743829] ---[ end Kern
Re: [PATCH v3 10/11] iommu/vt-d: Use device_domain_lock accurately
On 2022/7/1 16:15, Tian, Kevin wrote: From: Lu Baolu Sent: Wednesday, June 29, 2022 3:47 PM + spin_lock_irqsave(&device_domain_lock, flags); list_for_each_entry(info, &domain->devices, link) { - if (!info->dev) - continue; - suppose you can replace all spin_lock_irqsave() with spin_lock() in patch5 instead of leaving some replacement to next patch. Make sense to me. Will update this. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v3 02/11] iommu/vt-d: Remove clearing translation data in disable_dmar_iommu()
On 2022/7/1 15:58, Tian, Kevin wrote: From: Lu Baolu Sent: Wednesday, June 29, 2022 3:47 PM The disable_dmar_iommu() is called when IOMMU initialization fails or the IOMMU is hot-removed from the system. In both cases, there is no need to clear the IOMMU translation data structures for devices. On the initialization path, the device probing only happens after the IOMMU is initialized successfully, hence there're no translation data structures. On the hot-remove path, there is no real use case where the IOMMU is hot-removed, but the devices that it manages are still alive in the system. The translation data structures were torn down during device release, hence there's no need to repeat it in IOMMU hot-remove path either. This removes the unnecessary code and only leaves a check. Signed-off-by: Lu Baolu You probably overlooked my last comment on kexec: https://lore.kernel.org/lkml/bl1pr11mb52711a71ad9f11b7ae42694c8c...@bl1pr11mb5271.namprd11.prod.outlook.com/ I think my question is still not answered. Sorry! I did overlook that comment. I can see your points now, though it seems to be irrelevant to the problems that this series tries to solve. The failure path of copying table still needs some improvement. At least the pages allocated for root/context tables should be freed in the failure path. Even worse, the software occupied a bit of page table entry which is feasible for the old ECS, but not work for the new scalable mode anymore. All these problems deserve a separate series. We could address your concerns there. Does this work for you? Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v3 10/11] iommu/vt-d: Use device_domain_lock accurately
On 2022/7/1 16:15, Tian, Kevin wrote: From: Lu Baolu Sent: Wednesday, June 29, 2022 3:47 PM + spin_lock_irqsave(&device_domain_lock, flags); list_for_each_entry(info, &domain->devices, link) { - if (!info->dev) - continue; - suppose you can replace all spin_lock_irqsave() with spin_lock() in patch5 instead of leaving some replacement to next patch. Make sense. I will update the series. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v3 00/11] iommu/vt-d: Optimize the use of locks
On 2022/7/1 15:53, Tian, Kevin wrote: From: Lu Baolu Sent: Wednesday, June 29, 2022 3:47 PM v3: - Split reduction of lock ranges from changing irqsave. https://lore.kernel.org/linux- iommu/BN9PR11MB52760A3D7C6BF1AF9C9D34658CAA9@BN9PR11MB5276. namprd11.prod.outlook.com/ - Fully initialize the dev_info before adding it to the list. https://lore.kernel.org/linux- iommu/BN9PR11MB52764D7CD86448C5E4EB46668CAA9@BN9PR11MB5276. namprd11.prod.outlook.com/ - Various code and comments refinement. This doesn't say why original patch2 was removed: "iommu/vt-d: Remove for_each_device_domain()" It took me a while to realize that it's already covered by your another patch fixing RID2PASID. My fault! I forgot to mention it in the change log. Sorry about it. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 5/7] iommu/vt-d: Fix suspicious RCU usage in probe_acpi_namespace_devices()
On 2022/6/29 21:03, Robin Murphy wrote: On 2019-06-12 01:28, Lu Baolu wrote: The drhd and device scope list should be iterated with the iommu global lock held. Otherwise, a suspicious RCU usage message will be displayed. [ 3.695886] = [ 3.695917] WARNING: suspicious RCU usage [ 3.695950] 5.2.0-rc2+ #2467 Not tainted [ 3.695981] - [ 3.696014] drivers/iommu/intel-iommu.c:4569 suspicious rcu_dereference_check() usage! [ 3.696069] other info that might help us debug this: [ 3.696126] rcu_scheduler_active = 2, debug_locks = 1 [ 3.696173] no locks held by swapper/0/1. [ 3.696204] stack backtrace: [ 3.696241] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.2.0-rc2+ #2467 [ 3.696370] Call Trace: [ 3.696404] dump_stack+0x85/0xcb [ 3.696441] intel_iommu_init+0x128c/0x13ce [ 3.696478] ? kmem_cache_free+0x16b/0x2c0 [ 3.696516] ? __fput+0x14b/0x270 [ 3.696550] ? __call_rcu+0xb7/0x300 [ 3.696583] ? get_max_files+0x10/0x10 [ 3.696631] ? set_debug_rodata+0x11/0x11 [ 3.696668] ? e820__memblock_setup+0x60/0x60 [ 3.696704] ? pci_iommu_init+0x16/0x3f [ 3.696737] ? set_debug_rodata+0x11/0x11 [ 3.696770] pci_iommu_init+0x16/0x3f [ 3.696805] do_one_initcall+0x5d/0x2e4 [ 3.696844] ? set_debug_rodata+0x11/0x11 [ 3.696880] ? rcu_read_lock_sched_held+0x6b/0x80 [ 3.696924] kernel_init_freeable+0x1f0/0x27c [ 3.696961] ? 
rest_init+0x260/0x260 [ 3.696997] kernel_init+0xa/0x110 [ 3.697028] ret_from_fork+0x3a/0x50 Fixes: fa212a97f3a36 ("iommu/vt-d: Probe DMA-capable ACPI name space devices") Signed-off-by: Lu Baolu --- drivers/iommu/intel-iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 19c4c387a3f6..84e650c6a46d 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4793,8 +4793,10 @@ int __init intel_iommu_init(void) cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL, intel_iommu_cpu_dead); + down_read(_global_lock); if (probe_acpi_namespace_devices()) pr_warn("ACPI name space devices didn't probe correctly\n"); + up_read(_global_lock); Doing a bit of archaeology here, is this actually broken? If any ANDD entries exist, we'd end up doing: down_read(_global_lock) probe_acpi_namespace_devices() -> iommu_probe_device() -> iommu_create_device_direct_mappings() -> iommu_get_resv_regions() -> intel_iommu_get_resv_regions() -> down_read(_global_lock) I'm wondering whether this might explain why my bus_set_iommu series prevented Baolu's machine from booting, since "iommu: Move bus setup to IOMMU device registration" creates the same condition where we end up in get_resv_regions (via bus_iommu_probe() this time) from the same task that already holds dmar_global_lock. Of course that leaves me wondering how it *did* manage to boot OK on my Xeon box, but maybe there's a config difference or dumb luck at play? This is really problematic. Where does the latest bus_set_iommu series locate? I'd like to take a closer look at what happened here. Perhaps two weeks later? I'm busy with preparing Intel IOMMU patches for v5.20 these days. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v4 1/5] iommu: Return -EMEDIUMTYPE for incompatible domain and device/group
On 2022/7/1 04:36, Nicolin Chen wrote: Cases like VFIO wish to attach a device to an existing domain that was not allocated specifically from the device. This raises a condition where the IOMMU driver can fail the domain attach because the domain and device are incompatible with each other. This is a soft failure that can be resolved by using a different domain. Provide a dedicated errno from the IOMMU driver during attach that the reason attached failed is because of domain incompatibility. EMEDIUMTYPE is chosen because it is never used within the iommu subsystem today and evokes a sense that the 'medium' aka the domain is incompatible. VFIO can use this to know attach is a soft failure and it should continue searching. Otherwise the attach will be a hard failure and VFIO will return the code to userspace. Update all drivers to return EMEDIUMTYPE in their failure paths that are related to domain incompatibility. Also remove adjacent error prints for these soft failures, to prevent a kernel log spam, since -EMEDIUMTYPE is clear enough to indicate an incompatibility error. Add kdocs describing this behavior. Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v1 4/6] iommu/vt-d: Add VTD_FLAG_IOMMU_PROBED flag
On 6/30/22 4:29 PM, Tian, Kevin wrote: From: Lu Baolu Sent: Saturday, June 25, 2022 8:52 PM In the IOMMU hot-add path, there's a need to check whether an IOMMU has been probed. Instead of checking the IOMMU pointer in the global list, it's better to allocate a flag bit in iommu->flags for this purpose. Sorry I didn't get the point of original check. This is the hotplug path hence the caller of this function should already figure out it's a new iommu before reaching this point? Either did I. It was added by below commit without any comments about this check. commit ffebeb46dd34736c90ffbca1ccb0bef8f4827c44 Author: Jiang Liu Date: Sun Nov 9 22:48:02 2014 +0800 iommu/vt-d: Enhance intel-iommu driver to support DMAR unit hotplug Implement required callback functions for intel-iommu driver to support DMAR unit hotplug. Signed-off-by: Jiang Liu Reviewed-by: Yijing Wang Signed-off-by: Joerg Roedel I went through the whole hot-add process and found this check seemed to be duplicate. Hot-add process starts from dmar_device_hotplug(), it uses a rwlock to synchronize the hot-add paths. 2386 down_write(_global_lock); 2387 if (insert) 2388 ret = dmar_hotplug_insert(tmp); 2389 else 2390 ret = dmar_hotplug_remove(tmp); 2391 up_write(_global_lock); dmar_device_hotplug() ->dmar_hotplug_insert() -->dmar_parse_one_drhd() /* the added intel_iommu is allocated here*/ -->dmar_hp_add_drhd() /* the intel_iommu is about to bring up */ --->intel_iommu_add() The duplicate check here: if (g_iommus[iommu->seq_id]) return 0; All the iommu units are allocated and then initialized in the same synchronized path. There is no need to check a duplicate initialization. I would like to remove this check if no objection. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 04/11] iommu: Add sva iommu_domain support
On 2022/6/29 09:54, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 28, 2022 7:34 PM On 2022/6/28 16:50, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 28, 2022 1:41 PM struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; unsigned long pgsize_bitmap;/* Bitmap of page sizes in use */ - iommu_fault_handler_t handler; - void *handler_token; struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; + union { + struct {/* IOMMU_DOMAIN_DMA */ + iommu_fault_handler_t handler; + void *handler_token; + }; why is it DMA domain specific? What about unmanaged domain? Unrecoverable fault can happen on any type including SVA. Hence I think above should be domain type agnostic. The report_iommu_fault() should be replaced by the new iommu_report_device_fault(). Jean has already started this work. https://lore.kernel.org/linux-iommu/Yo4Nw9QyllT1RZbd@myrica/ Currently this is only for DMA domains, hence Robin suggested to make it exclude with the SVA domain things. https://lore.kernel.org/linux-iommu/f3170016-4d7f-e78e-db48- 68305f683...@arm.com/ Then it's worthy a comment that those two fields are for some legacy fault reporting stuff and DMA type only. The iommu_fault and SVA fields are exclusive. The former is used for unrecoverable DMA remapping faults, while the latter is only interested in the recoverable page faults. I will update the commit message with above explanation. Does this work for you? Not exactly. Your earlier explanation is about old vs. new API thus leaving the existing fault handler with current only user is fine. but this is not related to unrecoverable vs. recoverable. As I said unrecoverable could happen on all domain types. Tying it to DMA-only doesn't make sense and I think in the end the new iommu_report_device_fault() will need support both. Is it not the case? You are right. The report_iommu_fault() should be replaced by the new iommu_report_device_fault(). 
Leave the existing fault handler with the existing users and the newly added SVA members should exclude it. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 10/11] iommu: Per-domain I/O page fault handling
On 2022/6/28 22:20, Jean-Philippe Brucker wrote: On Tue, Jun 28, 2022 at 07:53:39PM +0800, Baolu Lu wrote: Once the iopf_handle_single() is removed, the name of iopf_handle_group() looks a little weired and confused, does this group mean the iommu group (domain) ? while I take some minutes to No. This is not the iommu group. It's page request group defined by the PCI SIG spec. Multiple page requests could be put in a group with a same group id. All page requests in a group could be responded to device in one shot. Thanks your explaination, understand the concept of PCIe PRG. I meant do we still have the necessity to mention the "group" here in the name iopf_handle_group(), which one is better ? iopf_handle_prg() or iopf_handler(), perhaps none of them ? :) Oh! Sorry for the misunderstanding. I have no strong feeling to change this naming. :-) All the names express what the helper does. Jean is the author of this framework. If he has the same idea as you, I don't mind renaming it in this patch. I'm not attached to the name, and I see how it could be confusing. Given that io-pgfault is not only for PCIe, 'prg' is not the best here either. iopf_handle_faults(), or just iopf_handler(), seem more suitable. Okay, so I will rename it to iopf_handle_faults() in this patch. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v1 0/6] iommu/vt-d: Reset DMAR_UNITS_SUPPORTED
On 2022/6/25 20:51, Lu Baolu wrote: Hi folks, This is a follow-up series of changes proposed by this patch: https://lore.kernel.org/linux-iommu/20220615183650.32075-1-steve.w...@hpe.com/ It removes several static arrays of size DMAR_UNITS_SUPPORTED and sets the DMAR_UNITS_SUPPORTED to 1024. Please help review and suggest. This series is also available on github: https://github.com/LuBaolu/intel-iommu/commits/vtd-next-for-v5.20 Best regards, baolu Best regards, baolu Lu Baolu (6): iommu/vt-d: Remove unused domain_get_iommu() iommu/vt-d: Use IDA interface to manage iommu sequence id iommu/vt-d: Refactor iommu information of each domain iommu/vt-d: Add VTD_FLAG_IOMMU_PROBED flag iommu/vt-d: Remove global g_iommus array iommu/vt-d: Make DMAR_UNITS_SUPPORTED default 1024 include/linux/dmar.h| 6 +- drivers/iommu/intel/iommu.h | 29 -- drivers/iommu/intel/dmar.c | 33 ++- drivers/iommu/intel/iommu.c | 188 ++-- drivers/iommu/intel/pasid.c | 2 +- drivers/iommu/intel/svm.c | 2 +- 6 files changed, 103 insertions(+), 157 deletions(-) ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 10/11] iommu: Per-domain I/O page fault handling
On 2022/6/28 18:02, Tian, Kevin wrote: From: Jean-Philippe Brucker Sent: Tuesday, June 28, 2022 5:44 PM On Tue, Jun 28, 2022 at 08:39:36AM +, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 21, 2022 10:44 PM Tweak the I/O page fault handling framework to route the page faults to the domain and call the page fault handler retrieved from the domain. This makes the I/O page fault handling framework possible to serve more usage scenarios as long as they have an IOMMU domain and install a page fault handler in it. Some unused functions are also removed to avoid dead code. The iommu_get_domain_for_dev_pasid() which retrieves attached domain for a {device, PASID} pair is used. It will be used by the page fault handling framework which knows {device, PASID} reported from the iommu driver. We have a guarantee that the SVA domain doesn't go away during IOPF handling, because unbind() waits for pending faults with iopf_queue_flush_dev() before freeing the domain. Hence, there's no need to synchronize life cycle of the iommu domains between the unbind() and the interrupt threads. I found iopf_queue_flush_dev() is only called in intel-iommu driver. Did I overlook anything? The SMMU driver will need it as well when we upstream PRI support. Currently it only supports stall, and that requires the device driver to flush all DMA including stalled transactions *before* calling unbind(), so ne need for iopf_queue_flush_dev() in this case. then it makes sense. Probably Baolu can add this information in the commit msg so others with similar question can quickly get the point here. Sure. Updated. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 10/11] iommu: Per-domain I/O page fault handling
On 2022/6/28 17:10, Ethan Zhao wrote: Hi, Baolu 在 2022/6/28 14:28, Baolu Lu 写道: Hi Ethan, On 2022/6/27 21:03, Ethan Zhao wrote: Hi, 在 2022/6/21 22:43, Lu Baolu 写道: Tweak the I/O page fault handling framework to route the page faults to the domain and call the page fault handler retrieved from the domain. This makes the I/O page fault handling framework possible to serve more usage scenarios as long as they have an IOMMU domain and install a page fault handler in it. Some unused functions are also removed to avoid dead code. The iommu_get_domain_for_dev_pasid() which retrieves attached domain for a {device, PASID} pair is used. It will be used by the page fault handling framework which knows {device, PASID} reported from the iommu driver. We have a guarantee that the SVA domain doesn't go away during IOPF handling, because unbind() waits for pending faults with iopf_queue_flush_dev() before freeing the domain. Hence, there's no need to synchronize life cycle of the iommu domains between the unbind() and the interrupt threads. 
Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker --- drivers/iommu/io-pgfault.c | 64 +- 1 file changed, 7 insertions(+), 57 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index aee9e033012f..4f24ec703479 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -69,69 +69,18 @@ static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, return iommu_page_response(dev, ); } -static enum iommu_page_response_code -iopf_handle_single(struct iopf_fault *iopf) -{ - vm_fault_t ret; - struct mm_struct *mm; - struct vm_area_struct *vma; - unsigned int access_flags = 0; - unsigned int fault_flags = FAULT_FLAG_REMOTE; - struct iommu_fault_page_request *prm = >fault.prm; - enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID; - - if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)) - return status; - - mm = iommu_sva_find(prm->pasid); - if (IS_ERR_OR_NULL(mm)) - return status; - - mmap_read_lock(mm); - - vma = find_extend_vma(mm, prm->addr); - if (!vma) - /* Unmapped area */ - goto out_put_mm; - - if (prm->perm & IOMMU_FAULT_PERM_READ) - access_flags |= VM_READ; - - if (prm->perm & IOMMU_FAULT_PERM_WRITE) { - access_flags |= VM_WRITE; - fault_flags |= FAULT_FLAG_WRITE; - } - - if (prm->perm & IOMMU_FAULT_PERM_EXEC) { - access_flags |= VM_EXEC; - fault_flags |= FAULT_FLAG_INSTRUCTION; - } - - if (!(prm->perm & IOMMU_FAULT_PERM_PRIV)) - fault_flags |= FAULT_FLAG_USER; - - if (access_flags & ~vma->vm_flags) - /* Access fault */ - goto out_put_mm; - - ret = handle_mm_fault(vma, prm->addr, fault_flags, NULL); - status = ret & VM_FAULT_ERROR ? IOMMU_PAGE_RESP_INVALID : - IOMMU_PAGE_RESP_SUCCESS; - -out_put_mm: - mmap_read_unlock(mm); - mmput(mm); - - return status; -} - Once the iopf_handle_single() is removed, the name of iopf_handle_group() looks a little weired and confused, does this group mean the iommu group (domain) ? while I take some minutes to No. This is not the iommu group. 
It's page request group defined by the PCI SIG spec. Multiple page requests could be put in a group with a same group id. All page requests in a group could be responded to device in one shot. Thanks your explaination, understand the concept of PCIe PRG. I meant do we still have the necessity to mention the "group" here in the name iopf_handle_group(), which one is better ? iopf_handle_prg() or iopf_handler(), perhaps none of them ? :) Oh! Sorry for the misunderstanding. I have no strong feeling to change this naming. :-) All the names express what the helper does. Jean is the author of this framework. If he has the same idea as you, I don't mind renaming it in this patch. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 04/11] iommu: Add sva iommu_domain support
On 2022/6/28 16:50, Tian, Kevin wrote: + + mutex_lock(&group->mutex); + curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, domain, GFP_KERNEL); + if (curr) + goto out_unlock; Need check xa_is_err(old). Either (1) old entry is a valid pointer, or return -EBUSY in this case (2) xa_is_err(curr) return xa_err(cur) are failure cases. Hence, "curr == NULL" is the only check we need. Did I miss anything? But now you always return -EBUSY for all kinds of xa errors. Fair enough. Updated like below. curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, domain, GFP_KERNEL); if (curr) { ret = xa_err(curr) ? : -EBUSY; goto out_unlock; } Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 04/11] iommu: Add sva iommu_domain support
On 2022/6/28 16:50, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 28, 2022 1:41 PM struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; unsigned long pgsize_bitmap;/* Bitmap of page sizes in use */ - iommu_fault_handler_t handler; - void *handler_token; struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; + union { + struct {/* IOMMU_DOMAIN_DMA */ + iommu_fault_handler_t handler; + void *handler_token; + }; why is it DMA domain specific? What about unmanaged domain? Unrecoverable fault can happen on any type including SVA. Hence I think above should be domain type agnostic. The report_iommu_fault() should be replaced by the new iommu_report_device_fault(). Jean has already started this work. https://lore.kernel.org/linux-iommu/Yo4Nw9QyllT1RZbd@myrica/ Currently this is only for DMA domains, hence Robin suggested to make it exclude with the SVA domain things. https://lore.kernel.org/linux-iommu/f3170016-4d7f-e78e-db48- 68305f683...@arm.com/ Then it's worthy a comment that those two fields are for some legacy fault reporting stuff and DMA type only. The iommu_fault and SVA fields are exclusive. The former is used for unrecoverable DMA remapping faults, while the latter is only interested in the recoverable page faults. I will update the commit message with above explanation. Does this work for you? Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 10/11] iommu: Per-domain I/O page fault handling
On 2022/6/28 16:39, Tian, Kevin wrote: static void iopf_handle_group(struct work_struct *work) { struct iopf_group *group; + struct iommu_domain *domain; struct iopf_fault *iopf, *next; enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; group = container_of(work, struct iopf_group, work); + domain = iommu_get_domain_for_dev_pasid(group->dev, + group->last_fault.fault.prm.pasid); + if (!domain || !domain->iopf_handler) + status = IOMMU_PAGE_RESP_INVALID; Miss a comment on why no refcnt is required on domain as explained in the commit msg. I had some comments around iommu_queue_iopf() in the previous patch. The iommu_queue_iopf() is the generic page fault handler exposed by iommu core, hence that's the right place to document this. Post it below as well: diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 1df8c1dcae77..aee9e033012f 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -181,6 +181,13 @@ static void iopf_handle_group(struct work_struct *work) * request completes, outstanding faults will have been dealt with by the time * the PASID is freed. * + * Any valid page fault will be eventually routed to an iommu domain and the + * page fault handler installed there will get called. The users of this + * handling framework should guarantee that the iommu domain could only be + * freed after the device has stopped generating page faults (or the iommu + * hardware has been set to block the page faults) and the pending page faults + * have been flushed. + * * Return: 0 on success and <0 on error. */ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 09/11] iommu: Prepare IOMMU domain for IOPF
On 2022/6/28 16:29, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 21, 2022 10:44 PM +/* + * I/O page fault handler for SVA + */ +enum iommu_page_response_code +iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) +{ + vm_fault_t ret; + struct mm_struct *mm; + struct vm_area_struct *vma; + unsigned int access_flags = 0; + struct iommu_domain *domain = data; + unsigned int fault_flags = FAULT_FLAG_REMOTE; + struct iommu_fault_page_request *prm = &fault->prm; + enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID; + + if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)) + return status; + + mm = domain->mm; What about directly passing domain->mm in as the fault data? The entire logic here is only about mm instead of domain. Yes. Updated. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 10/11] iommu: Per-domain I/O page fault handling
Hi Ethan, On 2022/6/27 21:03, Ethan Zhao wrote: Hi, 在 2022/6/21 22:43, Lu Baolu 写道: Tweak the I/O page fault handling framework to route the page faults to the domain and call the page fault handler retrieved from the domain. This makes the I/O page fault handling framework possible to serve more usage scenarios as long as they have an IOMMU domain and install a page fault handler in it. Some unused functions are also removed to avoid dead code. The iommu_get_domain_for_dev_pasid() which retrieves attached domain for a {device, PASID} pair is used. It will be used by the page fault handling framework which knows {device, PASID} reported from the iommu driver. We have a guarantee that the SVA domain doesn't go away during IOPF handling, because unbind() waits for pending faults with iopf_queue_flush_dev() before freeing the domain. Hence, there's no need to synchronize life cycle of the iommu domains between the unbind() and the interrupt threads. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker --- drivers/iommu/io-pgfault.c | 64 +- 1 file changed, 7 insertions(+), 57 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index aee9e033012f..4f24ec703479 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -69,69 +69,18 @@ static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, return iommu_page_response(dev, ); } -static enum iommu_page_response_code -iopf_handle_single(struct iopf_fault *iopf) -{ - vm_fault_t ret; - struct mm_struct *mm; - struct vm_area_struct *vma; - unsigned int access_flags = 0; - unsigned int fault_flags = FAULT_FLAG_REMOTE; - struct iommu_fault_page_request *prm = >fault.prm; - enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID; - - if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)) - return status; - - mm = iommu_sva_find(prm->pasid); - if (IS_ERR_OR_NULL(mm)) - return status; - - mmap_read_lock(mm); - - vma = find_extend_vma(mm, prm->addr); 
- if (!vma) - /* Unmapped area */ - goto out_put_mm; - - if (prm->perm & IOMMU_FAULT_PERM_READ) - access_flags |= VM_READ; - - if (prm->perm & IOMMU_FAULT_PERM_WRITE) { - access_flags |= VM_WRITE; - fault_flags |= FAULT_FLAG_WRITE; - } - - if (prm->perm & IOMMU_FAULT_PERM_EXEC) { - access_flags |= VM_EXEC; - fault_flags |= FAULT_FLAG_INSTRUCTION; - } - - if (!(prm->perm & IOMMU_FAULT_PERM_PRIV)) - fault_flags |= FAULT_FLAG_USER; - - if (access_flags & ~vma->vm_flags) - /* Access fault */ - goto out_put_mm; - - ret = handle_mm_fault(vma, prm->addr, fault_flags, NULL); - status = ret & VM_FAULT_ERROR ? IOMMU_PAGE_RESP_INVALID : - IOMMU_PAGE_RESP_SUCCESS; - -out_put_mm: - mmap_read_unlock(mm); - mmput(mm); - - return status; -} - Once the iopf_handle_single() is removed, the name of iopf_handle_group() looks a little weired and confused, does this group mean the iommu group (domain) ? while I take some minutes to No. This is not the iommu group. It's page request group defined by the PCI SIG spec. Multiple page requests could be put in a group with a same group id. All page requests in a group could be responded to device in one shot. Best regards, baolu look into the code, oh, means a batch / list / queue of iopfs , and iopf_handle_group() becomes a generic iopf_handler() . Doe it make sense to revise the names of iopf_handle_group(), iopf_complete_group, iopf_group in this patch set ? 
Thanks, Ethan static void iopf_handle_group(struct work_struct *work) { struct iopf_group *group; + struct iommu_domain *domain; struct iopf_fault *iopf, *next; enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; group = container_of(work, struct iopf_group, work); + domain = iommu_get_domain_for_dev_pasid(group->dev, + group->last_fault.fault.prm.pasid); + if (!domain || !domain->iopf_handler) + status = IOMMU_PAGE_RESP_INVALID; list_for_each_entry_safe(iopf, next, &group->faults, list) { /* @@ -139,7 +88,8 @@ static void iopf_handle_group(struct work_struct *work) * faults in the group if there is an error. */ if (status == IOMMU_PAGE_RESP_SUCCESS) - status = iopf_handle_single(iopf); + status = domain->iopf_handler(&iopf->fault, + domain->fault_data); if (!(iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 06/11] arm-smmu-v3/sva: Add SVA domain support
On 2022/6/27 19:50, Zhangfei Gao wrote: On 2022/6/21 下午10:43, Lu Baolu wrote: Add support for SVA domain allocation and provide an SVA-specific iommu_domain_ops. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Tested-by: Zhangfei Gao Have tested the series on aarch64. Tony has been helping to validate this series on Intel's platform. Tony, can I add your Test-by as well in this series? Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 07/11] iommu/sva: Refactoring iommu_sva_bind/unbind_device()
On 2022/6/27 18:14, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 21, 2022 10:44 PM +struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) +{ + struct iommu_domain *domain; + ioasid_t max_pasids; + int ret = -EINVAL; + + /* Allocate mm->pasid if necessary. */ this comment is for iommu_sva_alloc_pasid() Updated. + max_pasids = dev->iommu->max_pasids; + if (!max_pasids) + return ERR_PTR(-EOPNOTSUPP); + + ret = iommu_sva_alloc_pasid(mm, 1, max_pasids - 1); + if (ret) + return ERR_PTR(ret); + ... +void iommu_sva_unbind_device(struct iommu_sva *handle) +{ + struct device *dev = handle->dev; + struct iommu_domain *domain = + container_of(handle, struct iommu_domain, bond); + ioasid_t pasid = iommu_sva_get_pasid(handle); + + mutex_lock(_sva_lock); + if (refcount_dec_and_test(>bond.users)) { + iommu_detach_device_pasid(domain, dev, pasid); + iommu_domain_free(domain); + } + mutex_unlock(_sva_lock); +} +EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); + +u32 iommu_sva_get_pasid(struct iommu_sva *handle) +{ + struct iommu_domain *domain = + container_of(handle, struct iommu_domain, bond); + + return domain->mm->pasid; +} +EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); Looks this is only used by unbind_device. Just open code it. It's part of current IOMMU/SVA interfaces for the device drivers, and has been used in various drivers. $ git grep iommu_sva_get_pasid drivers/dma/idxd/cdev.c:pasid = iommu_sva_get_pasid(sva); drivers/iommu/iommu-sva-lib.c: ioasid_t pasid = iommu_sva_get_pasid(handle); drivers/iommu/iommu-sva-lib.c:u32 iommu_sva_get_pasid(struct iommu_sva *handle) drivers/iommu/iommu-sva-lib.c:EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); drivers/misc/uacce/uacce.c: pasid = iommu_sva_get_pasid(handle); include/linux/iommu.h:u32 iommu_sva_get_pasid(struct iommu_sva *handle); include/linux/iommu.h:static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) Or, I missed anything? 
Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 06/11] arm-smmu-v3/sva: Add SVA domain support
On 2022/6/27 19:50, Zhangfei Gao wrote: On 2022/6/21 下午10:43, Lu Baolu wrote: Add support for SVA domain allocation and provide an SVA-specific iommu_domain_ops. Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Tested-by: Zhangfei Gao Have tested the series on aarch64. Thank you very much! Very appreciated for your help! Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 04/11] iommu: Add sva iommu_domain support
Hi Kevin, On 2022/6/27 16:29, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 21, 2022 10:44 PM The sva iommu_domain represents a hardware pagetable that the IOMMU hardware could use for SVA translation. This adds some infrastructure to support SVA domain in the iommu common layer. It includes: - Extend the iommu_domain to support a new IOMMU_DOMAIN_SVA domain type. The IOMMU drivers that support SVA should provide the sva domain specific iommu_domain_ops. - Add a helper to allocate an SVA domain. The iommu_domain_free() is still used to free an SVA domain. - Add helpers to attach an SVA domain to a device and the reverse operation. Some buses, like PCI, route packets without considering the PASID value. Thus a DMA target address with PASID might be treated as P2P if the address falls into the MMIO BAR of other devices in the group. To make things simple, the attach/detach interfaces only apply to devices belonging to the singleton groups, and the singleton is immutable in fabric i.e. not affected by hotplug. The iommu_attach/detach_device_pasid() can be used for other purposes, such as kernel DMA with pasid, mediation device, etc. I'd split this into two patches. One for adding iommu_attach/ detach_device_pasid() and set/block_dev_pasid ops, and the other for adding SVA. Yes. Make sense. struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; unsigned long pgsize_bitmap;/* Bitmap of page sizes in use */ - iommu_fault_handler_t handler; - void *handler_token; struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; + union { + struct {/* IOMMU_DOMAIN_DMA */ + iommu_fault_handler_t handler; + void *handler_token; + }; why is it DMA domain specific? What about unmanaged domain? Unrecoverable fault can happen on any type including SVA. Hence I think above should be domain type agnostic. The report_iommu_fault() should be replaced by the new iommu_report_device_fault(). Jean has already started this work. 
https://lore.kernel.org/linux-iommu/Yo4Nw9QyllT1RZbd@myrica/ Currently this is only for DMA domains, hence Robin suggested to make it exclude with the SVA domain things. https://lore.kernel.org/linux-iommu/f3170016-4d7f-e78e-db48-68305f683...@arm.com/ + struct {/* IOMMU_DOMAIN_SVA */ + struct mm_struct *mm; + }; + }; }; + +struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_domain *domain; + + domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); + if (!domain) + return NULL; + + domain->type = IOMMU_DOMAIN_SVA; It's a bit weird that the type has been specified when calling ops->domain_alloc while it still leaves to the caller to set the type. But this is not caused by this series. could be cleaned up separately. Yes. Robin has patches to refactor the domain allocation interface, let's wait and see what it looks like finally. + + mutex_lock(&group->mutex); + curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, domain, GFP_KERNEL); + if (curr) + goto out_unlock; Need check xa_is_err(old). Either (1) old entry is a valid pointer, or (2) xa_is_err(curr) are failure cases. Hence, "curr == NULL" is the only check we need. Did I miss anything? Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v9 00/11] iommu: SVA and IOPF refactoring
Hi folks, On 2022/6/21 22:43, Lu Baolu wrote: Hi folks, The former part of this series refactors the IOMMU SVA code by assigning an SVA type of iommu_domain to a shared virtual address and replacing sva_bind/unbind iommu ops with set/block_dev_pasid domain ops. The latter part changes the existing I/O page fault handling framework from only serving SVA to a generic one. Any driver or component could handle the I/O page faults for its domain in its own way by installing an I/O page fault handler. This series has been functionally tested on an x86 machine and compile tested for all architectures. This series is also available on github: [2]https://github.com/LuBaolu/intel-iommu/commits/iommu-sva-refactoring-v9 Please review and suggest. Just a gentle ping on this series. Do you have further inputs? I am trying to see if we can merge this series for v5.20. The drivers also depend on it to enable their kernel DMA with PASID. Sorry to disturb you. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v3 1/1] iommu/vt-d: Fix RID2PASID setup/teardown failure
On 2022/6/24 14:02, Ethan Zhao wrote: 在 2022/6/23 14:57, Lu Baolu 写道: The IOMMU driver shares the pasid table for PCI alias devices. When the RID2PASID entry of the shared pasid table has been filled by the first device, the subsequent device will encounter the "DMAR: Setup RID2PASID failed" failure as the pasid entry has already been marked as present. As the result, the IOMMU probing process will be aborted. On the contrary, when any alias device is hot-removed from the system, for example, by writing to /sys/bus/pci/devices/.../remove, the shared RID2PASID will be cleared without any notifications to other devices. As the result, any DMAs from those rest devices are blocked. Sharing pasid table among PCI alias devices could save two memory pages for devices underneath the PCIe-to-PCI bridges. Anyway, considering that those devices are rare on modern platforms that support VT-d in scalable mode and the saved memory is negligible, it's reasonable to remove this part of immature code to make the driver feasible and stable. In my understanding, this cleaning will make the pasid table become a per-dev data structure whether the dev is a pci-alias or not, and the pasid_pte_is_present(pte) will only check against every pci-alias' own private pasid table, the setup stage wouldn't break, so does the detach/release path, and little value to code other reference counter like complex implementation, looks good to me ! Thanks! Can I add a Reviewed-by from you? Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH] iommu/vt-d: Fix PCI bus rescan device hot add
Hi Joerg, On 2022/6/24 13:45, Joerg Roedel wrote: Hi Baolu, On Wed, May 25, 2022 at 09:40:26AM +0800, Baolu Lu wrote: How do you like it? If you agree, I can queue it in my next pull request for fixes. Would it help to tie DMAR and IOMMU components together, so that selecting DMAR for IRQ remapping also selects IOMMU? The IOMMU can be in PT mode and I think it would simplify a lot of things. It makes sense as far as I am aware. By putting IOMMUs in pass-through mode, there will be no run-time costs and things could be simplified a lot. Besides the refactoring efforts, we still need this quick fix so that the fix could be propagated to various stable and vendors' downstream trees. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v3 2/5] vfio/iommu_type1: Prefer to reuse domains vs match enforced cache coherency
On 2022/6/24 04:00, Nicolin Chen wrote: From: Jason Gunthorpe The KVM mechanism for controlling wbinvd is based on OR of the coherency property of all devices attached to a guest, no matter whether those devices are attached to a single domain or multiple domains. On the other hand, the benefit to using separate domains was that those devices attached to domains supporting enforced cache coherency always mapped with the attributes necessary to provide that feature, therefore if a non-enforced domain was dropped, the associated group removal would re-trigger an evaluation by KVM. In practice however, the only known cases of such mixed domains included an Intel IGD device behind an IOMMU lacking snoop control, where such devices do not support hotplug, therefore this scenario lacks testing and is not considered sufficiently relevant to support. After all, KVM won't take advantage of trying to push a device that could do enforced cache coherency to a dedicated domain vs re-using an existing domain, which is non-coherent. Simplify this code and eliminate the test. This removes the only logic that needed to have a dummy domain attached prior to searching for a matching domain and simplifies the next patches. It's unclear whether we want to further optimize the Intel driver to update the domain coherency after a device is detached from it, at least not before KVM can be verified to handle such dynamics in related emulation paths (wbinvd, vcpu load, write_cr0, ept, etc.). In reality we don't see an usage requiring such optimization as the only device which imposes such non-coherency is Intel GPU which even doesn't support hotplug/hot remove. 
Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen --- drivers/vfio/vfio_iommu_type1.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index c13b9290e357..f4e3b423a453 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2285,9 +2285,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, * testing if they're on the same bus_type. */ list_for_each_entry(d, >domain_list, next) { - if (d->domain->ops == domain->domain->ops && - d->enforce_cache_coherency == - domain->enforce_cache_coherency) { + if (d->domain->ops == domain->domain->ops) { iommu_detach_group(domain->domain, group->iommu_group); if (!iommu_attach_group(d->domain, group->iommu_group)) { Reviewed-by: Lu Baolu Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: iommu_sva_bind_device question
On 2022/6/24 09:14, Jerry Snitselaar wrote: On Fri, Jun 24, 2022 at 08:55:08AM +0800, Baolu Lu wrote: On 2022/6/24 01:02, Jerry Snitselaar wrote: Hi Baolu & Dave, I noticed last night that on a Sapphire Rapids system if you boot without intel_iommu=on, the idxd driver will crash during probe in iommu_sva_bind_device(). Should there be a sanity check before calling dev_iommu_ops(), or is the expectation that the caller would verify it is safe to call? This seemed to be uncovered by the combination of 3f6634d997db ("iommu: Use right way to retrieve iommu_ops"), and 42a1b73852c4 ("dmaengine: idxd: Separate user and kernel pasid enabling"). [ 21.423729] BUG: kernel NULL pointer dereference, address: 0038 [ 21.445108] #PF: supervisor read access in kernel mode [ 21.450912] #PF: error_code(0x) - not-present page [ 21.456706] PGD 0 [ 21.459047] Oops: [#1] PREEMPT SMP NOPTI [ 21.464004] CPU: 0 PID: 1420 Comm: kworker/0:3 Not tainted 5.19.0-0.rc3.27.eln120.x86_64 #1 [ 21.464011] Hardware name: Intel Corporation EAGLESTREAM/EAGLESTREAM, BIOS EGSDCRB1.SYS.0067.D12.2110190954 10/19/2021 [ 21.464015] Workqueue: events work_for_cpu_fn [ 21.464030] RIP: 0010:iommu_sva_bind_device+0x1d/0xe0 [ 21.464046] Code: c3 cc 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 57 41 56 49 89 d6 41 55 41 54 55 53 48 83 ec 08 48 8b 87 d8 02 00 00 <48> 8b 40 38 48 8b 50 10 48 83 7a 70 00 48 89 14 24 0f 84 91 00 00 [ 21.464050] RSP: 0018:ff7245d9096b7db8 EFLAGS: 00010296 [ 21.464054] RAX: RBX: ff1eadeec8a51000 RCX: [ 21.464058] RDX: ff7245d9096b7e24 RSI: RDI: ff1eadeec8a510d0 [ 21.464060] RBP: ff1eadeec8a51000 R08: b1a12300 R09: ff1eadffbfce25b4 [ 21.464062] R10: R11: 0038 R12: c09f8000 [ 21.464065] R13: ff1eadeec8a510d0 R14: ff7245d9096b7e24 R15: ff1eaddf54429000 [ 21.464067] FS: () GS:ff1eadee7f60() knlGS: [ 21.464070] CS: 0010 DS: ES: CR0: 80050033 [ 21.464072] CR2: 0038 CR3: 0008c0e10006 CR4: 00771ef0 [ 21.464074] DR0: DR1: DR2: [ 21.464076] DR3: DR6: fffe07f0 DR7: 0400 [ 21.464078] PKRU: 
5554 [ 21.464079] Call Trace: [ 21.464083] [ 21.464092] idxd_pci_probe+0x259/0x1070 [idxd] [ 21.464121] local_pci_probe+0x3e/0x80 [ 21.464132] work_for_cpu_fn+0x13/0x20 [ 21.464136] process_one_work+0x1c4/0x380 [ 21.464143] worker_thread+0x1ab/0x380 [ 21.464147] ? _raw_spin_lock_irqsave+0x23/0x50 [ 21.464158] ? process_one_work+0x380/0x380 [ 21.464161] kthread+0xe6/0x110 [ 21.464168] ? kthread_complete_and_exit+0x20/0x20 [ 21.464172] ret_from_fork+0x1f/0x30 I figure either there needs to be a check in iommu_sva_bind_device, or idxd needs to check in idxd_enable_system_pasid that that idxd->pdev->dev.iommu is not null before it tries calling iommu_sva_bind_device. As documented around the iommu_sva_bind_device() interface: * iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) must be called first, to * initialize the required SVA features. idxd->pdev->dev.iommu should be checked in there. Dave, any thoughts? Best regards, baolu Duh, sorry I missed that in the comments. It calls iommu_dev_enable_feature(), but then goes into code that calls iommu_sva_bind_device whether or not iommu_dev_enable_feature() fails. You also will get the following warning if you don't have scalable mode enabled (either not enabled by default, or if enabled by default and passed intel_iommu=on,sm_off): If scalable mode is disabled, iommu_dev_enable_feature(IOMMU_SVA) will return failure, hence driver should not call iommu_sva_bind_device(). I guess below will disappear if above is fixed in the idxd driver. Best regards, baolu [ 24.645784] idxd :6a:01.0: enabling device (0144 -> 0146) [ 24.645871] idxd :6a:01.0: Unable to turn on user SVA feature. 
[ 24.645932] [ cut here ] [ 24.645935] WARNING: CPU: 0 PID: 422 at drivers/iommu/intel/pasid.c:253 intel_pasid_get_entry.isra.0+0xcd/0xe0 [ 24.675872] Modules linked in: intel_uncore(+) drm_ttm_helper isst_if_mbox_pci(+) idxd(+) snd i2c_i801(+) isst_if_mmio ttm isst_if_common mei fjes(+) soundcore intel_vsec i2c_ismt i2c_smbus idxd_bus ipmi_ssif acpi_ipmi ipmi_si acpi_pad acpi_power_meter pfr_telemetry pfr_update vfat fat fuse xfs crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel igc wmi pinctrl_emmitsburg ipmi_devintf ipmi_msghandler [ 24.716612] CPU: 0 PID: 422 Comm: kworker/0:2 Not tainted 5.19.0-0.rc3.27.eln120.x86_64 #1 [ 24.716621] Hardware name: Intel Corporation EAGLESTREAM/EAGLESTREAM, BIOS EGSDCRB1.SYS.0067.D12.2110190954 10/19/2021 [ 24.716625] Workqueue: events work_for_cpu_fn [ 24.716645] RIP
Re: [PATCH v3 1/5] iommu: Return -EMEDIUMTYPE for incompatible domain and device/group
On 2022/6/24 04:00, Nicolin Chen wrote: diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index e1cb51b9866c..5386d889429d 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -304,7 +304,7 @@ static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device /* Only allow the domain created internally. */ mtk_mapping = data->mapping; if (mtk_mapping->domain != domain) - return 0; + return -EMEDIUMTYPE; if (!data->m4u_dom) { data->m4u_dom = dom; This change looks odd. It turns the return value from success to failure. Is it a bug? If so, it should go through a separated fix patch. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: iommu_sva_bind_device question
On 2022/6/24 01:02, Jerry Snitselaar wrote: Hi Baolu & Dave, I noticed last night that on a Sapphire Rapids system if you boot without intel_iommu=on, the idxd driver will crash during probe in iommu_sva_bind_device(). Should there be a sanity check before calling dev_iommu_ops(), or is the expectation that the caller would verify it is safe to call? This seemed to be uncovered by the combination of 3f6634d997db ("iommu: Use right way to retrieve iommu_ops"), and 42a1b73852c4 ("dmaengine: idxd: Separate user and kernel pasid enabling"). [ 21.423729] BUG: kernel NULL pointer dereference, address: 0038 [ 21.445108] #PF: supervisor read access in kernel mode [ 21.450912] #PF: error_code(0x) - not-present page [ 21.456706] PGD 0 [ 21.459047] Oops: [#1] PREEMPT SMP NOPTI [ 21.464004] CPU: 0 PID: 1420 Comm: kworker/0:3 Not tainted 5.19.0-0.rc3.27.eln120.x86_64 #1 [ 21.464011] Hardware name: Intel Corporation EAGLESTREAM/EAGLESTREAM, BIOS EGSDCRB1.SYS.0067.D12.2110190954 10/19/2021 [ 21.464015] Workqueue: events work_for_cpu_fn [ 21.464030] RIP: 0010:iommu_sva_bind_device+0x1d/0xe0 [ 21.464046] Code: c3 cc 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 57 41 56 49 89 d6 41 55 41 54 55 53 48 83 ec 08 48 8b 87 d8 02 00 00 <48> 8b 40 38 48 8b 50 10 48 83 7a 70 00 48 89 14 24 0f 84 91 00 00 [ 21.464050] RSP: 0018:ff7245d9096b7db8 EFLAGS: 00010296 [ 21.464054] RAX: RBX: ff1eadeec8a51000 RCX: [ 21.464058] RDX: ff7245d9096b7e24 RSI: RDI: ff1eadeec8a510d0 [ 21.464060] RBP: ff1eadeec8a51000 R08: b1a12300 R09: ff1eadffbfce25b4 [ 21.464062] R10: R11: 0038 R12: c09f8000 [ 21.464065] R13: ff1eadeec8a510d0 R14: ff7245d9096b7e24 R15: ff1eaddf54429000 [ 21.464067] FS: () GS:ff1eadee7f60() knlGS: [ 21.464070] CS: 0010 DS: ES: CR0: 80050033 [ 21.464072] CR2: 0038 CR3: 0008c0e10006 CR4: 00771ef0 [ 21.464074] DR0: DR1: DR2: [ 21.464076] DR3: DR6: fffe07f0 DR7: 0400 [ 21.464078] PKRU: 5554 [ 21.464079] Call Trace: [ 21.464083] [ 21.464092] idxd_pci_probe+0x259/0x1070 [idxd] [ 21.464121] 
local_pci_probe+0x3e/0x80 [ 21.464132] work_for_cpu_fn+0x13/0x20 [ 21.464136] process_one_work+0x1c4/0x380 [ 21.464143] worker_thread+0x1ab/0x380 [ 21.464147] ? _raw_spin_lock_irqsave+0x23/0x50 [ 21.464158] ? process_one_work+0x380/0x380 [ 21.464161] kthread+0xe6/0x110 [ 21.464168] ? kthread_complete_and_exit+0x20/0x20 [ 21.464172] ret_from_fork+0x1f/0x30 I figure either there needs to be a check in iommu_sva_bind_device, or idxd needs to check in idxd_enable_system_pasid that that idxd->pdev->dev.iommu is not null before it tries calling iommu_sva_bind_device. As documented around the iommu_sva_bind_device() interface: * iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) must be called first, to * initialize the required SVA features. idxd->pdev->dev.iommu should be checked in there. Dave, any thoughts? Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v3] iommu/vt-d: Make DMAR_UNITS_SUPPORTED a config setting
On 2022/6/23 10:51, Jerry Snitselaar wrote: The real problem here is that the iommu sequence ID overflows if DMAR_UNITS_SUPPORTED is not big enough. This is purely a software implementation issue, I am not sure whether user opt-in when building a kernel package could help a lot here. Is this something that could be figured out when parsing the dmar table? It looks like currently iommu_refcnt[], iommu_did[], and dmar_seq_ids[] depend on it. That's definitely a better solution. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v3] iommu/vt-d: Make DMAR_UNITS_SUPPORTED a config setting
On 2022/6/22 23:05, Jerry Snitselaar wrote: On Wed, Jun 22, 2022 at 7:52 AM Baolu Lu wrote: On 2022/6/16 02:36, Steve Wahl wrote: To support up to 64 sockets with 10 DMAR units each (640), make the value of DMAR_UNITS_SUPPORTED adjustable by a config variable, CONFIG_DMAR_UNITS_SUPPORTED, and make it's default 1024 when MAXSMP is set. If the available hardware exceeds DMAR_UNITS_SUPPORTED (previously set to MAX_IO_APICS, or 128), it causes these messages: "DMAR: Failed to allocate seq_id", "DMAR: Parse DMAR table failure.", and "x2apic: IRQ remapping doesn't support X2APIC mode x2apic disabled"; and the system fails to boot properly. Signed-off-by: Steve Wahl Reviewed-by: Kevin Tian --- Note that we could not find a reason for connecting DMAR_UNITS_SUPPORTED to MAX_IO_APICS as was done previously. Perhaps it seemed like the two would continue to match on earlier processors. There doesn't appear to be kernel code that assumes that the value of one is related to the other. v2: Make this value a config option, rather than a fixed constant. The default values should match previous configuration except in the MAXSMP case. Keeping the value at a power of two was requested by Kevin Tian. v3: Make the config option dependent upon DMAR_TABLE, as it is not used without this. 
drivers/iommu/intel/Kconfig | 7 +++ include/linux/dmar.h| 6 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 39a06d245f12..07aaebcb581d 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -9,6 +9,13 @@ config DMAR_PERF config DMAR_DEBUG bool +config DMAR_UNITS_SUPPORTED + int "Number of DMA Remapping Units supported" + depends on DMAR_TABLE + default 1024 if MAXSMP + default 128 if X86_64 + default 64 With this patch applied, the IOMMU configuration looks like: [*] AMD IOMMU support AMD IOMMU Version 2 driver [*] Enable AMD IOMMU internals in DebugFS (1024) Number of DMA Remapping Units supported <<<< NEW [*] Support for Intel IOMMU using DMA Remapping Devices [*] Export Intel IOMMU internals in Debugfs [*] Support for Shared Virtual Memory with Intel IOMMU [*] Enable Intel DMA Remapping Devices by default [*] Enable Intel IOMMU scalable mode by default [*] Support for Interrupt Remapping [*] OMAP IOMMU Support [*] Export OMAP IOMMU internals in DebugFS [*] Rockchip IOMMU Support The NEW item looks confusing. It looks to be a generic configurable value though it's actually Intel DMAR specific. Any thoughts? Best regards, baolu Would moving it under INTEL_IOMMU at least have it show up below "Support for Intel IOMMU using DMA Remapping Devices"? I'm not sure it can be better than that, because IRQ_REMAP selects DMAR_TABLE, so we can't stick it in the if INTEL_IOMMU section. It's more reasonable to move it under INTEL_IOMMU, but the trouble is that this also stands even if INTEL_IOMMU is not configured. The real problem here is that the iommu sequence ID overflows if DMAR_UNITS_SUPPORTED is not big enough. This is purely a software implementation issue, I am not sure whether user opt-in when building a kernel package could help a lot here. If we can't find a better way, can we just step back? 
Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 2/2] vfio: Use device_iommu_capable()
On 2022/6/22 20:04, Robin Murphy wrote: Use the new interface to check the capabilities for our device specifically. Signed-off-by: Robin Murphy --- drivers/vfio/vfio.c | 2 +- drivers/vfio/vfio_iommu_type1.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 73bab04880d0..765d68192c88 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -621,7 +621,7 @@ int vfio_register_group_dev(struct vfio_device *device) * VFIO always sets IOMMU_CACHE because we offer no way for userspace to * restore cache coherency. */ - if (!iommu_capable(device->dev->bus, IOMMU_CAP_CACHE_COHERENCY)) + if (!device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) return -EINVAL; return __vfio_register_dev(device, diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index e38b8bfde677..2107e95eb743 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2247,7 +2247,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, list_add(>next, >group_list); msi_remap = irq_domain_check_msi_remap() || - iommu_capable(iommu_api_dev->dev->bus, IOMMU_CAP_INTR_REMAP); + device_iommu_capable(iommu_api_dev->dev, IOMMU_CAP_INTR_REMAP); if (!allow_unsafe_interrupts && !msi_remap) { pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", Reviewed-by: Lu Baolu Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 1/2] vfio/type1: Simplify bus_type determination
On 2022/6/22 20:04, Robin Murphy wrote: Since IOMMU groups are mandatory for drivers to support, it stands to reason that any device which has been successfully be added to a group must be on a bus supported by that IOMMU driver, and therefore a domain viable for any device in the group must be viable for all devices in the group. This already has to be the case for the IOMMU API's internal default domain, for instance. Thus even if the group contains devices on different buses, that can only mean that the IOMMU driver actually supports such an odd topology, and so without loss of generality we can expect the bus type of any device in a group to be suitable for IOMMU API calls. Ideally we could remove bus->iommu_ops and all IOMMU APIs go through the dev_iommu_ops(). Replace vfio_bus_type() with a simple call to resolve an appropriate member device from which to then derive a bus type. This is also a step towards removing the vague bus-based interfaces from the IOMMU API, when we can subsequently switch to using this device directly. Furthermore, scrutiny reveals a lack of protection for the bus being removed while vfio_iommu_type1_attach_group() is using it; the reference that VFIO holds on the iommu_group ensures that data remains valid, but does not prevent the group's membership changing underfoot. Holding the vfio_device for as long as we need here also neatly solves this. Signed-off-by: Robin Murphy Reviewed-by: Lu Baolu Best regards, baolu --- After sleeping on it, I decided to type up the helper function approach to see how it looked in practice, and in doing so realised that with one more tweak it could also subsume the locking out of the common paths as well, so end up being a self-contained way for type1 to take care of its own concern, which I rather like. 
drivers/vfio/vfio.c | 18 +- drivers/vfio/vfio.h | 3 +++ drivers/vfio/vfio_iommu_type1.c | 30 +++--- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 61e71c1154be..73bab04880d0 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -448,7 +448,7 @@ static void vfio_group_get(struct vfio_group *group) * Device objects - create, release, get, put, search */ /* Device reference always implies a group reference */ -static void vfio_device_put(struct vfio_device *device) +void vfio_device_put(struct vfio_device *device) { if (refcount_dec_and_test(>refcount)) complete(>comp); @@ -475,6 +475,22 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group, return NULL; } +struct vfio_device *vfio_device_get_from_iommu(struct iommu_group *iommu_group) +{ + struct vfio_group *group = vfio_group_get_from_iommu(iommu_group); + struct vfio_device *device; + + mutex_lock(>device_lock); + list_for_each_entry(device, >device_list, group_next) { + if (vfio_device_try_get(device)) { + mutex_unlock(>device_lock); + return device; + } + } + mutex_unlock(>device_lock); + return NULL; +} + /* * VFIO driver API */ diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index a67130221151..e8f21e64541b 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -70,3 +70,6 @@ struct vfio_iommu_driver_ops { int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops); + +struct vfio_device *vfio_device_get_from_iommu(struct iommu_group *iommu_group); +void vfio_device_put(struct vfio_device *device); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index c13b9290e357..e38b8bfde677 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1679,18 +1679,6 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, return ret; } -static int vfio_bus_type(struct device 
*dev, void *data) -{ - struct bus_type **bus = data; - - if (*bus && *bus != dev->bus) - return -EINVAL; - - *bus = dev->bus; - - return 0; -} - static int vfio_iommu_replay(struct vfio_iommu *iommu, struct vfio_domain *domain) { @@ -2159,7 +2147,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, struct vfio_iommu *iommu = iommu_data; struct vfio_iommu_group *group; struct vfio_domain *domain, *d; - struct bus_type *bus = NULL; + struct vfio_device *iommu_api_dev; bool resv_msi, msi_remap; phys_addr_t resv_msi_base = 0; struct iommu_domain_geometry *geo; @@ -2192,18 +2180,19 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, goto out_unlock; } - /* Determine bus_type in order to allocate a domain */ - ret =
Re: [PATCH v3] iommu/vt-d: Make DMAR_UNITS_SUPPORTED a config setting
On 2022/6/16 02:36, Steve Wahl wrote: To support up to 64 sockets with 10 DMAR units each (640), make the value of DMAR_UNITS_SUPPORTED adjustable by a config variable, CONFIG_DMAR_UNITS_SUPPORTED, and make it's default 1024 when MAXSMP is set. If the available hardware exceeds DMAR_UNITS_SUPPORTED (previously set to MAX_IO_APICS, or 128), it causes these messages: "DMAR: Failed to allocate seq_id", "DMAR: Parse DMAR table failure.", and "x2apic: IRQ remapping doesn't support X2APIC mode x2apic disabled"; and the system fails to boot properly. Signed-off-by: Steve Wahl Reviewed-by: Kevin Tian --- Note that we could not find a reason for connecting DMAR_UNITS_SUPPORTED to MAX_IO_APICS as was done previously. Perhaps it seemed like the two would continue to match on earlier processors. There doesn't appear to be kernel code that assumes that the value of one is related to the other. v2: Make this value a config option, rather than a fixed constant. The default values should match previous configuration except in the MAXSMP case. Keeping the value at a power of two was requested by Kevin Tian. v3: Make the config option dependent upon DMAR_TABLE, as it is not used without this. 
drivers/iommu/intel/Kconfig | 7 +++ include/linux/dmar.h| 6 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 39a06d245f12..07aaebcb581d 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -9,6 +9,13 @@ config DMAR_PERF config DMAR_DEBUG bool +config DMAR_UNITS_SUPPORTED + int "Number of DMA Remapping Units supported" + depends on DMAR_TABLE + default 1024 if MAXSMP + default 128 if X86_64 + default 64 With this patch applied, the IOMMU configuration looks like: [*] AMD IOMMU support AMD IOMMU Version 2 driver [*] Enable AMD IOMMU internals in DebugFS (1024) Number of DMA Remapping Units supported NEW [*] Support for Intel IOMMU using DMA Remapping Devices [*] Export Intel IOMMU internals in Debugfs [*] Support for Shared Virtual Memory with Intel IOMMU [*] Enable Intel DMA Remapping Devices by default [*] Enable Intel IOMMU scalable mode by default [*] Support for Interrupt Remapping [*] OMAP IOMMU Support [*] Export OMAP IOMMU internals in DebugFS [*] Rockchip IOMMU Support The NEW item looks confusing. It looks to be a generic configurable value though it's actually Intel DMAR specific. Any thoughts? Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 1/1] iommu/vt-d: Fix RID2PASID setup failure
On 2022/6/22 17:09, Ethan Zhao wrote: 在 2022/6/22 12:41, Lu Baolu 写道: The IOMMU driver shares the pasid table for PCI alias devices. When the RID2PASID entry of the shared pasid table has been filled by the first device, the subsequent devices will encounter the "DMAR: Setup RID2PASID failed" failure as the pasid entry has already been marked as present. As the result, the IOMMU probing process will be aborted. This fixes it by skipping RID2PASID setting if the pasid entry has been populated. This works because the IOMMU core ensures that only the same IOMMU domain can be attached to all PCI alias devices at the same time. Therefore the subsequent devices just try to setup the RID2PASID entry with the same domain, which is negligible. This also adds domain validity checks for more confidence anyway. Fixes: ef848b7e5a6a0 ("iommu/vt-d: Setup pasid entry for RID2PASID support") Reported-by: Chenyi Qiang Cc: sta...@vger.kernel.org Signed-off-by: Lu Baolu --- drivers/iommu/intel/pasid.c | 22 -- 1 file changed, 16 insertions(+), 6 deletions(-) Change log: v2: - Add domain validity check in RID2PASID entry setup. diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index cb4c1d0cf25c..4f3525f3346f 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -575,6 +575,19 @@ static inline int pasid_enable_wpe(struct pasid_entry *pte) return 0; }; +/* + * Return true if @pasid is RID2PASID and the domain @did has already + * been setup to the @pte. Otherwise, return false. PCI alias devices + * probably share the single RID2PASID pasid entry in the shared pasid + * table. It's reasonable that those devices try to set a share domain + * in their probe paths. 
+ */ I am thinking about the counter-part, the intel_pasid_tear_down_entry(), Multi devices share the same PASID entry, then one was detached from the domain, so the entry doesn't exist anymore, while other devices don't know about the change, and they are using the mapping, is it possible case ? shared thing, no refer-counter, am I missing something ? No. You are right. When any alias device is hot-removed from the system, the shared RID2PASID will be cleared without any notification to other devices. Hence any DMAs from those devices are blocked. We still have a lot to do for sharing pasid table among alias devices. Before we arrive there, let's remove it for now. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 3/3] iommu: Clean up release_device checks
On 2022/6/22 15:17, Robin Murphy wrote: On 2022-06-22 02:36, Baolu Lu wrote: On 2022/6/21 23:14, Robin Murphy wrote: Since .release_device is now called through per-device ops, any call which gets as far as a driver definitely*is* for that driver, for a device which has successfully passed .probe_device, so all the checks to that effect are now redundant and can be removed. In the same vein we can also skip freeing fwspecs which are now managed by core code. Does this depend on any other series? I didn't see iommu_fwspec_free() called in the core code. Or I missed anything? dev_iommu_free() cleans up param->fwspec directly (see b54240ad4943). FWIW the plan is that iommu_fwspec_free() should eventually go away - of the remaining uses after this, two are in fact similarly redundant already, since there's also a dev_iommu_free() in the probe failure path, and the other two should disappear in part 2 of fixing the bus probing mess (wherein the of_xlate step gets pulled into iommu_probe_device as well, and finally works correctly again). Yes, it is. Thanks for the explanation. Reviewed-by: Lu Baolu Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/1] iommu/vt-d: Fix RID2PASID setup failure
On 2022/6/22 11:31, Tian, Kevin wrote: From: Baolu Lu Sent: Wednesday, June 22, 2022 11:28 AM On 2022/6/22 11:06, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 21, 2022 5:04 PM On 2022/6/21 13:48, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 21, 2022 12:28 PM On 2022/6/21 11:46, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 21, 2022 11:39 AM On 2022/6/21 10:54, Tian, Kevin wrote: From: Lu Baolu Sent: Monday, June 20, 2022 4:17 PM @@ -2564,7 +2564,7 @@ static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) ret = intel_pasid_setup_second_level(iommu, domain, dev, PASID_RID2PASID); spin_unlock_irqrestore(>lock, flags); - if (ret) { + if (ret && ret != -EBUSY) { dev_err(dev, "Setup RID2PASID failed\n"); dmar_remove_one_dev_info(dev); return ret; -- 2.25.1 It's cleaner to avoid this error at the first place, i.e. only do the setup when the first device is attached to the pasid table. The logic that identifies the first device might introduce additional unnecessary complexity. Devices that share a pasid table are rare. I even prefer to give up sharing tables so that the code can be simpler.:-) It's not that complex if you simply move device_attach_pasid_table() out of intel_pasid_alloc_table(). Then do the setup if list_empty(_table->dev) and then attach device to the pasid table in domain_add_dev_info(). The pasid table is part of the device, hence a better place to allocate/free the pasid table is in the device probe/release paths. Things will become more complicated if we change relationship between device and it's pasid table when attaching/detaching a domain. That's the reason why I thought it was additional complexity. If you do want to follow current route it’s still cleaner to check whether the pasid entry has pointed to the domain in the individual setup function instead of blindly returning -EBUSY and then ignoring it even if a real busy condition occurs. 
The setup functions can just return zero for this benign alias case. Kevin, how do you like this one? diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index cb4c1d0cf25c..ecffd0129b2b 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -575,6 +575,16 @@ static inline int pasid_enable_wpe(struct pasid_entry *pte) return 0; }; +/* + * Return true if @pasid is RID2PASID and the domain @did has already + * been setup to the @pte. Otherwise, return false. + */ +static inline bool +rid2pasid_domain_valid(struct pasid_entry *pte, u32 pasid, u16 did) +{ + return pasid == PASID_RID2PASID && pasid_get_domain_id(pte) == did; +} better this is not restricted to RID2PASID only, e.g. pasid_pte_match_domain() and then read pasid from the pte to compare with the pasid argument. The pasid value is not encoded in the pasid table entry. This validity check is only for RID2PASID as alias devices share the single RID2PASID entry. For other cases, we should always return -EBUSY as what the code is doing now. You are right. Very appreciated for your input. I will update it with a v2. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/1] iommu/vt-d: Fix RID2PASID setup failure
On 2022/6/22 11:06, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 21, 2022 5:04 PM On 2022/6/21 13:48, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 21, 2022 12:28 PM On 2022/6/21 11:46, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 21, 2022 11:39 AM On 2022/6/21 10:54, Tian, Kevin wrote: From: Lu Baolu Sent: Monday, June 20, 2022 4:17 PM @@ -2564,7 +2564,7 @@ static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) ret = intel_pasid_setup_second_level(iommu, domain, dev, PASID_RID2PASID); spin_unlock_irqrestore(>lock, flags); - if (ret) { + if (ret && ret != -EBUSY) { dev_err(dev, "Setup RID2PASID failed\n"); dmar_remove_one_dev_info(dev); return ret; -- 2.25.1 It's cleaner to avoid this error at the first place, i.e. only do the setup when the first device is attached to the pasid table. The logic that identifies the first device might introduce additional unnecessary complexity. Devices that share a pasid table are rare. I even prefer to give up sharing tables so that the code can be simpler.:-) It's not that complex if you simply move device_attach_pasid_table() out of intel_pasid_alloc_table(). Then do the setup if list_empty(_table->dev) and then attach device to the pasid table in domain_add_dev_info(). The pasid table is part of the device, hence a better place to allocate/free the pasid table is in the device probe/release paths. Things will become more complicated if we change relationship between device and it's pasid table when attaching/detaching a domain. That's the reason why I thought it was additional complexity. If you do want to follow current route it’s still cleaner to check whether the pasid entry has pointed to the domain in the individual setup function instead of blindly returning -EBUSY and then ignoring it even if a real busy condition occurs. The setup functions can just return zero for this benign alias case. Kevin, how do you like this one? 
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index cb4c1d0cf25c..ecffd0129b2b 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -575,6 +575,16 @@ static inline int pasid_enable_wpe(struct pasid_entry *pte) return 0; }; +/* + * Return true if @pasid is RID2PASID and the domain @did has already + * been setup to the @pte. Otherwise, return false. + */ +static inline bool +rid2pasid_domain_valid(struct pasid_entry *pte, u32 pasid, u16 did) +{ + return pasid == PASID_RID2PASID && pasid_get_domain_id(pte) == did; +} better this is not restricted to RID2PASID only, e.g. pasid_pte_match_domain() and then read pasid from the pte to compare with the pasid argument. The pasid value is not encoded in the pasid table entry. This validity check is only for RID2PASID as alias devices share the single RID2PASID entry. For other cases, we should always return -EBUSY as what the code is doing now. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/1] iommu/vt-d: Fix RID2PASID setup failure
On 2022/6/22 10:56, Ethan Zhao wrote: 在 2022/6/20 16:17, Lu Baolu 写道: The IOMMU driver shares the pasid table for PCI alias devices. When the RID2PASID entry of the shared pasid table has been filled by the first device, the subsequent devices will encounter the "DMAR: Setup RID2PASID failed" failure as the pasid entry has already been marked as present. As the result, the IOMMU probing process will be aborted. This fixes it by skipping RID2PASID setting if the pasid entry has been populated. This works because the IOMMU core ensures that only the same IOMMU domain can be attached to all PCI alias devices at the same time. Therefore the subsequent devices just try to setup the RID2PASID entry with the same domain, which is negligible. We have two customers reported the issue "DMAR: Setup RID2PASID failed", Two ASPEED devices locate behind one PCIe-PCI bridge and iommu SM, PT mode is enabled. Most Interesting thing is the second device is only used by BIOS, and BIOS left it to OS without shutting down, and it is useless for OS. This sounds odd. Isn't this a bug? Is there practical case multi devices behind PCIe-PCI bridge share the same PASID entry without any security concern ? these two customer's case is not. The devices underneath the PCIe-PCI bridge are alias devices of the bridge. PCI alias devices always sit in the same group (the minimal unit that IOMMU guarantees isolation) and can only be attached with a same domain (managed I/O address space). Hence, there's no security concern if they further share the pasid table. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 3/3] iommu: Clean up release_device checks
On 2022/6/21 23:14, Robin Murphy wrote: Since .release_device is now called through per-device ops, any call which gets as far as a driver definitely*is* for that driver, for a device which has successfully passed .probe_device, so all the checks to that effect are now redundant and can be removed. In the same vein we can also skip freeing fwspecs which are now managed by core code. Does this depend on any other series? I didn't see iommu_fwspec_free() called in the core code. Or I missed anything? Signed-off-by: Robin Murphy --- drivers/iommu/apple-dart.c | 3 --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 8 +--- drivers/iommu/arm/arm-smmu/arm-smmu.c | 14 +++--- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 11 --- drivers/iommu/exynos-iommu.c| 3 --- drivers/iommu/mtk_iommu.c | 5 - drivers/iommu/mtk_iommu_v1.c| 5 - drivers/iommu/sprd-iommu.c | 11 --- drivers/iommu/virtio-iommu.c| 8 +--- 9 files changed, 5 insertions(+), 63 deletions(-) Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 2/3] iommu: Make .release_device optional
On 2022/6/21 23:14, Robin Murphy wrote: Many drivers do nothing meaningful for .release_device, and it's neatly abstracted to just two callsites in the core code, so let's make it optional to implement. Signed-off-by: Robin Murphy --- drivers/iommu/fsl_pamu_domain.c | 5 - drivers/iommu/iommu.c | 6 -- drivers/iommu/msm_iommu.c | 5 - drivers/iommu/sun50i-iommu.c| 3 --- drivers/iommu/tegra-gart.c | 5 - drivers/iommu/tegra-smmu.c | 3 --- 6 files changed, 4 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index 94b4589dc67c..011f9ab7f743 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -447,15 +447,10 @@ static struct iommu_device *fsl_pamu_probe_device(struct device *dev) return _iommu; } -static void fsl_pamu_release_device(struct device *dev) -{ -} - static const struct iommu_ops fsl_pamu_ops = { .capable= fsl_pamu_capable, .domain_alloc = fsl_pamu_domain_alloc, .probe_device = fsl_pamu_probe_device, - .release_device = fsl_pamu_release_device, .device_group = fsl_pamu_device_group, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = fsl_pamu_attach_device, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 06d6989f07f6..8b4fc7e62b99 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -259,7 +259,8 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list return 0; out_release: - ops->release_device(dev); + if (ops->release_device) + ops->release_device(dev); out_module_put: module_put(ops->owner); @@ -337,7 +338,8 @@ void iommu_release_device(struct device *dev) iommu_device_unlink(dev->iommu->iommu_dev, dev); ops = dev_iommu_ops(dev); - ops->release_device(dev); + if (ops->release_device) + ops->release_device(dev); iommu_group_remove_device(dev); module_put(ops->owner); diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index f09aedfdd462..428919a474c1 100644 --- 
a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -394,10 +394,6 @@ static struct iommu_device *msm_iommu_probe_device(struct device *dev) return >iommu; } -static void msm_iommu_release_device(struct device *dev) -{ -} - static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) { int ret = 0; @@ -677,7 +673,6 @@ irqreturn_t msm_iommu_fault_handler(int irq, void *dev_id) static struct iommu_ops msm_iommu_ops = { .domain_alloc = msm_iommu_domain_alloc, .probe_device = msm_iommu_probe_device, - .release_device = msm_iommu_release_device, .device_group = generic_device_group, .pgsize_bitmap = MSM_IOMMU_PGSIZES, .of_xlate = qcom_iommu_of_xlate, diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index c54ab477b8fd..a84c63518773 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -738,8 +738,6 @@ static struct iommu_device *sun50i_iommu_probe_device(struct device *dev) return >iommu; } -static void sun50i_iommu_release_device(struct device *dev) {} - static struct iommu_group *sun50i_iommu_device_group(struct device *dev) { struct sun50i_iommu *iommu = sun50i_iommu_from_dev(dev); @@ -764,7 +762,6 @@ static const struct iommu_ops sun50i_iommu_ops = { .domain_alloc = sun50i_iommu_domain_alloc, .of_xlate = sun50i_iommu_of_xlate, .probe_device = sun50i_iommu_probe_device, - .release_device = sun50i_iommu_release_device, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = sun50i_iommu_attach_device, .detach_dev = sun50i_iommu_detach_device, diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index a6700a40a6f8..e5ca3cf1a949 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -246,10 +246,6 @@ static struct iommu_device *gart_iommu_probe_device(struct device *dev) return _handle->iommu; } -static void gart_iommu_release_device(struct device *dev) -{ -} - static int gart_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) 
{ @@ -273,7 +269,6 @@ static void gart_iommu_sync(struct iommu_domain *domain, static const struct iommu_ops gart_iommu_ops = { .domain_alloc = gart_iommu_domain_alloc, .probe_device = gart_iommu_probe_device, - .release_device = gart_iommu_release_device, .device_group = generic_device_group, .pgsize_bitmap = GART_IOMMU_PGSIZES, .of_xlate = gart_iommu_of_xlate, diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index
Re: [PATCH 1/1] iommu/vt-d: Fix RID2PASID setup failure
On 2022/6/21 13:48, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 21, 2022 12:28 PM On 2022/6/21 11:46, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 21, 2022 11:39 AM On 2022/6/21 10:54, Tian, Kevin wrote: From: Lu Baolu Sent: Monday, June 20, 2022 4:17 PM @@ -2564,7 +2564,7 @@ static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) ret = intel_pasid_setup_second_level(iommu, domain, dev, PASID_RID2PASID); spin_unlock_irqrestore(>lock, flags); - if (ret) { + if (ret && ret != -EBUSY) { dev_err(dev, "Setup RID2PASID failed\n"); dmar_remove_one_dev_info(dev); return ret; -- 2.25.1 It's cleaner to avoid this error at the first place, i.e. only do the setup when the first device is attached to the pasid table. The logic that identifies the first device might introduce additional unnecessary complexity. Devices that share a pasid table are rare. I even prefer to give up sharing tables so that the code can be simpler.:-) It's not that complex if you simply move device_attach_pasid_table() out of intel_pasid_alloc_table(). Then do the setup if list_empty(_table->dev) and then attach device to the pasid table in domain_add_dev_info(). The pasid table is part of the device, hence a better place to allocate/free the pasid table is in the device probe/release paths. Things will become more complicated if we change relationship between device and it's pasid table when attaching/detaching a domain. That's the reason why I thought it was additional complexity. If you do want to follow current route it’s still cleaner to check whether the pasid entry has pointed to the domain in the individual setup function instead of blindly returning -EBUSY and then ignoring it even if a real busy condition occurs. The setup functions can just return zero for this benign alias case. Kevin, how do you like this one? 
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index cb4c1d0cf25c..ecffd0129b2b 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -575,6 +575,16 @@ static inline int pasid_enable_wpe(struct pasid_entry *pte) return 0; }; +/* + * Return true if @pasid is RID2PASID and the domain @did has already + * been setup to the @pte. Otherwise, return false. + */ +static inline bool +rid2pasid_domain_valid(struct pasid_entry *pte, u32 pasid, u16 did) +{ + return pasid == PASID_RID2PASID && pasid_get_domain_id(pte) == did; +} + /* * Set up the scalable mode pasid table entry for first only * translation type. @@ -595,9 +605,8 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, if (WARN_ON(!pte)) return -EINVAL; - /* Caller must ensure PASID entry is not in use. */ if (pasid_pte_is_present(pte)) - return -EBUSY; + return rid2pasid_domain_valid(pte, pasid, did) ? 0: -EBUSY; pasid_clear_entry(pte); @@ -698,9 +707,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -ENODEV; } - /* Caller must ensure PASID entry is not in use. */ if (pasid_pte_is_present(pte)) - return -EBUSY; + return rid2pasid_domain_valid(pte, pasid, did) ? 0: -EBUSY; pasid_clear_entry(pte); pasid_set_domain_id(pte, did); @@ -738,9 +746,8 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return -ENODEV; } - /* Caller must ensure PASID entry is not in use. */ if (pasid_pte_is_present(pte)) - return -EBUSY; + return rid2pasid_domain_valid(pte, pasid, did) ? 0: -EBUSY; pasid_clear_entry(pte); pasid_set_domain_id(pte, did); -- Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/1] iommu/vt-d: Fix RID2PASID setup failure
On 2022/6/21 13:48, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 21, 2022 12:28 PM On 2022/6/21 11:46, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 21, 2022 11:39 AM On 2022/6/21 10:54, Tian, Kevin wrote: From: Lu Baolu Sent: Monday, June 20, 2022 4:17 PM @@ -2564,7 +2564,7 @@ static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) ret = intel_pasid_setup_second_level(iommu, domain, dev, PASID_RID2PASID); spin_unlock_irqrestore(>lock, flags); - if (ret) { + if (ret && ret != -EBUSY) { dev_err(dev, "Setup RID2PASID failed\n"); dmar_remove_one_dev_info(dev); return ret; -- 2.25.1 It's cleaner to avoid this error at the first place, i.e. only do the setup when the first device is attached to the pasid table. The logic that identifies the first device might introduce additional unnecessary complexity. Devices that share a pasid table are rare. I even prefer to give up sharing tables so that the code can be simpler.:-) It's not that complex if you simply move device_attach_pasid_table() out of intel_pasid_alloc_table(). Then do the setup if list_empty(_table->dev) and then attach device to the pasid table in domain_add_dev_info(). The pasid table is part of the device, hence a better place to allocate/free the pasid table is in the device probe/release paths. Things will become more complicated if we change relationship between device and it's pasid table when attaching/detaching a domain. That's the reason why I thought it was additional complexity. If you do want to follow current route it’s still cleaner to check whether the pasid entry has pointed to the domain in the individual setup function instead of blindly returning -EBUSY and then ignoring it even if a real busy condition occurs. The setup functions can just return zero for this benign alias case. Fair enough. Let me improve it. -- Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/1] iommu/vt-d: Fix RID2PASID setup failure
On 2022/6/21 11:46, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 21, 2022 11:39 AM On 2022/6/21 10:54, Tian, Kevin wrote: From: Lu Baolu Sent: Monday, June 20, 2022 4:17 PM @@ -2564,7 +2564,7 @@ static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) ret = intel_pasid_setup_second_level(iommu, domain, dev, PASID_RID2PASID); spin_unlock_irqrestore(>lock, flags); - if (ret) { + if (ret && ret != -EBUSY) { dev_err(dev, "Setup RID2PASID failed\n"); dmar_remove_one_dev_info(dev); return ret; -- 2.25.1 It's cleaner to avoid this error at the first place, i.e. only do the setup when the first device is attached to the pasid table. The logic that identifies the first device might introduce additional unnecessary complexity. Devices that share a pasid table are rare. I even prefer to give up sharing tables so that the code can be simpler.:-) It's not that complex if you simply move device_attach_pasid_table() out of intel_pasid_alloc_table(). Then do the setup if list_empty(_table->dev) and then attach device to the pasid table in domain_add_dev_info(). The pasid table is part of the device, hence a better place to allocate/free the pasid table is in the device probe/release paths. Things will become more complicated if we change relationship between device and it's pasid table when attaching/detaching a domain. That's the reason why I thought it was additional complexity. -- Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/1] iommu/vt-d: Fix RID2PASID setup failure
On 2022/6/21 10:54, Tian, Kevin wrote: From: Lu Baolu Sent: Monday, June 20, 2022 4:17 PM @@ -2564,7 +2564,7 @@ static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) ret = intel_pasid_setup_second_level(iommu, domain, dev, PASID_RID2PASID); spin_unlock_irqrestore(>lock, flags); - if (ret) { + if (ret && ret != -EBUSY) { dev_err(dev, "Setup RID2PASID failed\n"); dmar_remove_one_dev_info(dev); return ret; -- 2.25.1 It's cleaner to avoid this error at the first place, i.e. only do the setup when the first device is attached to the pasid table. The logic that identifies the first device might introduce additional unnecessary complexity. Devices that share a pasid table are rare. I even prefer to give up sharing tables so that the code can be simpler.:-) -- Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/1] iommu/vt-d: Fix RID2PASID setup failure
On 2022/6/20 16:31, Yi Liu wrote: Hi Baolu, On 2022/6/20 16:17, Lu Baolu wrote: The IOMMU driver shares the pasid table for PCI alias devices. When the RID2PASID entry of the shared pasid table has been filled by the first device, the subsequent devices will encounter the "DMAR: Setup RID2PASID failed" failure as the pasid entry has already been marke as present. As s/marke/marked/ Updated. Thanks! the result, the IOMMU probing process will be aborted. This fixes it by skipping RID2PASID setting if the pasid entry has been populated. This works because the IOMMU core ensures that only the same IOMMU domain can be attached to all PCI alias devices at the same time. nit. this sentence is a little bit to interpret. :-) I guess what you want to describe is the PCI alias devices should be attached to the same domain instead of different domain. right? Yes. also, does it apply to all domain types? e.g. the SVA domains introduced in "iommu: SVA and IOPF refactoring" No. Only about the RID2PASID. Therefore the subsequent devices just try to setup the RID2PASID entry with the same domain, which is negligible. 
Fixes: ef848b7e5a6a0 ("iommu/vt-d: Setup pasid entry for RID2PASID support") Reported-by: Chenyi Qiang Cc: sta...@vger.kernel.org Signed-off-by: Lu Baolu --- drivers/iommu/intel/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 44016594831d..b9966c01a2a2 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2564,7 +2564,7 @@ static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) ret = intel_pasid_setup_second_level(iommu, domain, dev, PASID_RID2PASID); spin_unlock_irqrestore(>lock, flags); - if (ret) { + if (ret && ret != -EBUSY) { dev_err(dev, "Setup RID2PASID failed\n"); dmar_remove_one_dev_info(dev); return ret; -- Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v8 05/11] iommu/vt-d: Add SVA domain support
On 2022/6/17 15:47, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 7, 2022 9:50 AM + +static const struct iommu_domain_ops intel_svm_domain_ops = { + .set_dev_pasid = intel_svm_attach_dev_pasid, + .block_dev_pasid = intel_svm_detach_dev_pasid, + .free = intel_svm_domain_free, +}; + It's clearer to use set/block for intel callbacks. Yes. That reads clearer. -- Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v8 04/11] iommu: Add sva iommu_domain support
On 2022/6/17 15:43, Tian, Kevin wrote: From: Baolu Lu Sent: Friday, June 10, 2022 3:16 PM +#define __IOMMU_DOMAIN_HOST_VA (1U << 5) /* Host CPU virtual address */ Do you mean general CPU VA? or Host CPU VA, I'm reading the latter as 2nd stage? Host CPU VA. In the near future, we will add another flag _GUEST_VA, so that the shared page table types are distinguished. How does the kernel know that a user page table translates guest VA? IMHO I don't think the kernel should care about it except managing all the aspects related to the user page table itself... Okay. + /* * This are the possible domain-types * @@ -86,15 +89,24 @@ struct iommu_domain_geometry { #define IOMMU_DOMAIN_DMA_FQ (__IOMMU_DOMAIN_PAGING |\ __IOMMU_DOMAIN_DMA_API | \ __IOMMU_DOMAIN_DMA_FQ) +#define IOMMU_DOMAIN_SVA (__IOMMU_DOMAIN_SHARED | \ +__IOMMU_DOMAIN_HOST_VA) Doesn't shared automatically mean CPU VA? Do we need another flag? Yes. Shared means CPU VA, but there're many types. Besides above two, we also see the shared KVM/EPT. Will the two sharing scenarios share any common code? If not then having a separate flag bit is meaningless. So far, I haven't seen the need for common code. I've ever thought about the common notifier callback for page table entry update of SVA and KVM. But there has been no feasible plan. It might be more straightforward to be: #define IOMMU_DOMAIN_SVA __IOMMU_DOMAIN_SVA #define IOMMU_DOMAIN_KVM __IOMMU_DOMAIN_KVM #define IOMMU_DOMAIN_USER __IOMMU_DOMAIN_USER I am okay with this and we can add some shared bits later if we need to consolidate any code. -- Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [RFC PATCHES 1/2] iommu: Add RCU-protected page free support
On 2022/6/15 23:40, Jason Gunthorpe wrote: On Fri, Jun 10, 2022 at 01:37:20PM +0800, Baolu Lu wrote: On 2022/6/9 20:49, Jason Gunthorpe wrote: +void iommu_free_pgtbl_pages(struct iommu_domain *domain, + struct list_head *pages) +{ + struct page *page, *next; + + if (!domain->concurrent_traversal) { + put_pages_list(pages); + return; + } + + list_for_each_entry_safe(page, next, pages, lru) { + list_del(&page->lru); + call_rcu(&page->rcu_head, pgtble_page_free_rcu); + } It seems OK, but I wonder if there is benefit to using put_pages_list() from the rcu callback The price is that we need to allocate a "struct list_head" and free it in the rcu callback as well. Currently the list_head is sitting in the stack. You'd have to use a different struct page layout so that the list_head was in the struct page and didn't overlap with the rcu_head Okay, let me head this direction in the next version. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 1/5] iommu: Return -EMEDIUMTYPE for incompatible domain and device/group
On 2022/6/16 08:03, Nicolin Chen wrote: diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 44016594831d..0dd13330fe12 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4323,7 +4323,7 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, return -ENODEV; if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) - return -EOPNOTSUPP; + return -EMEDIUMTYPE; /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); @@ -4331,10 +4331,10 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, addr_width = cap_mgaw(iommu->cap); if (dmar_domain->max_addr > (1LL << addr_width)) { - dev_err(dev, "%s: iommu width (%d) is not " + dev_dbg(dev, "%s: iommu width (%d) is not " "sufficient for the mapped address (%llx)\n", __func__, addr_width, dmar_domain->max_addr); - return -EFAULT; + return -EMEDIUMTYPE; } dmar_domain->gaw = addr_width; Can we simply remove the dev_err()? As the return value has explicitly explained the failure reason, putting a print statement won't help much. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 03/12] iommu/vt-d: Remove clearing translation data in disable_dmar_iommu()
On 2022/6/15 14:22, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 14, 2022 3:21 PM On 2022/6/14 14:49, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 14, 2022 10:51 AM The disable_dmar_iommu() is called when IOMMU initialization fails or the IOMMU is hot-removed from the system. In both cases, there is no need to clear the IOMMU translation data structures for devices. On the initialization path, the device probing only happens after the IOMMU is initialized successfully, hence there're no translation data structures. Out of curiosity. With kexec the IOMMU may contain stale mappings from the old kernel. Then is it meaningful to disable IOMMU after the new kernel fails to initialize it properly? For kexec kernel, if the IOMMU is detected to be pre-enabled, the IOMMU driver will try to copy tables from the old kernel. If copying table fails, the IOMMU driver will disable IOMMU and do the normal initialization. What about an error occurred after copying table in the initialization path? The new kernel will be in a state assuming iommu is disabled but it is still enabled using an old mapping for certain devices... If copying table failed, the translation will be disabled and a clean root table will be used. if (translation_pre_enabled(iommu)) { pr_info("Translation already enabled - trying to copy translation structures\n"); ret = copy_translation_tables(iommu); if (ret) { /* * We found the IOMMU with translation * enabled - but failed to copy over the * old root-entry table. Try to proceed * by disabling translation now and * allocating a clean root-entry table. * This might cause DMAR faults, but * probably the dump will still succeed. 
*/ pr_err("Failed to copy translation tables from previous kernel for %s\n", iommu->name); iommu_disable_translation(iommu); clear_translation_pre_enabled(iommu); } else { pr_info("Copied translation tables from previous kernel for %s\n", iommu->name); } } Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 01/12] iommu/vt-d: debugfs: Remove device_domain_lock usage
On 2022/6/15 14:13, Tian, Kevin wrote: From: Baolu Lu Sent: Wednesday, June 15, 2022 9:54 AM On 2022/6/14 14:43, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 14, 2022 10:51 AM The domain_translation_struct debugfs node is used to dump the DMAR page tables for the PCI devices. It potentially races with setting domains to devices. The existing code uses a global spinlock device_domain_lock to avoid the races, but this is problematical as this lock is only used to protect the device tracking lists of each domain. is it really problematic at this point? Before following patches are applied using device_domain_lock should have similar effect as holding the group lock. Here it might make more sense to just focus on removing the use of device_domain_lock outside of iommu.c. Just that using group lock is cleaner and more compatible to following cleanups. and it's worth mentioning that racing with page table updates is out of the scope of this series. Probably also add a comment in the code to clarify this point. Hi Kevin, How do you like below updated patch? Yes, this is better. From cecc9a0623780a11c4ea4d0a15aa6187f01541c4 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sun, 29 May 2022 10:18:56 +0800 Subject: [PATCH 1/1] iommu/vt-d: debugfs: Remove device_domain_lock usage The domain_translation_struct debugfs node is used to dump the DMAR page tables for the PCI devices. It potentially races with setting domains to devices. The existing code uses the global spinlock device_domain_lock to avoid the races. This removes the use of device_domain_lock outside of iommu.c by replacing it with the group mutex lock. Using the group mutex lock is cleaner and more compatible to following cleanups. 
Signed-off-by: Lu Baolu --- drivers/iommu/intel/debugfs.c | 42 +-- drivers/iommu/intel/iommu.c | 2 +- drivers/iommu/intel/iommu.h | 1 - 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index d927ef10641b..f4acd8993f60 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -342,13 +342,13 @@ static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, } } -static int show_device_domain_translation(struct device *dev, void *data) +static int __show_device_domain_translation(struct device *dev, void *data) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *domain = info->domain; + struct dmar_domain *domain; struct seq_file *m = data; u64 path[6] = { 0 }; + domain = to_dmar_domain(iommu_get_domain_for_dev(dev)); if (!domain) return 0; @@ -359,20 +359,38 @@ static int show_device_domain_translation(struct device *dev, void *data) pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path); seq_putc(m, '\n'); - return 0; + return 1; } -static int domain_translation_struct_show(struct seq_file *m, void *unused) +static int show_device_domain_translation(struct device *dev, void *data) { - unsigned long flags; - int ret; + struct iommu_group *group; - spin_lock_irqsave(_domain_lock, flags); - ret = bus_for_each_dev(_bus_type, NULL, m, - show_device_domain_translation); - spin_unlock_irqrestore(_domain_lock, flags); + group = iommu_group_get(dev); + if (group) { + /* +* The group->mutex is held across the callback, which will +* block calls to iommu_attach/detach_group/device. Hence, +* the domain of the device will not change during traversal. +* +* All devices in an iommu group share a single domain, hence +* we only dump the domain of the first device. Even though, bus_for_each_dev() will still lead to duplicated dump in the same group but probably we can leave with it for a debug interface. Yes. This is what it was. 
Ideally we could walk the iommu groups and dump the device names belonging to the group and it's domain mappings, but I was not willing to add any helpers in the iommu core just for a debugfs use. --- Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 01/12] iommu/vt-d: debugfs: Remove device_domain_lock usage
On 2022/6/14 14:43, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 14, 2022 10:51 AM The domain_translation_struct debugfs node is used to dump the DMAR page tables for the PCI devices. It potentially races with setting domains to devices. The existing code uses a global spinlock device_domain_lock to avoid the races, but this is problematical as this lock is only used to protect the device tracking lists of each domain. is it really problematic at this point? Before following patches are applied using device_domain_lock should have similar effect as holding the group lock. Here it might make more sense to just focus on removing the use of device_domain_lock outside of iommu.c. Just that using group lock is cleaner and more compatible to following cleanups. and it's worth mentioning that racing with page table updates is out of the scope of this series. Probably also add a comment in the code to clarify this point. Hi Kevin, How do you like below updated patch? From cecc9a0623780a11c4ea4d0a15aa6187f01541c4 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sun, 29 May 2022 10:18:56 +0800 Subject: [PATCH 1/1] iommu/vt-d: debugfs: Remove device_domain_lock usage The domain_translation_struct debugfs node is used to dump the DMAR page tables for the PCI devices. It potentially races with setting domains to devices. The existing code uses the global spinlock device_domain_lock to avoid the races. This removes the use of device_domain_lock outside of iommu.c by replacing it with the group mutex lock. Using the group mutex lock is cleaner and more compatible to following cleanups. 
Signed-off-by: Lu Baolu --- drivers/iommu/intel/debugfs.c | 42 +-- drivers/iommu/intel/iommu.c | 2 +- drivers/iommu/intel/iommu.h | 1 - 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index d927ef10641b..f4acd8993f60 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -342,13 +342,13 @@ static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, } } -static int show_device_domain_translation(struct device *dev, void *data) +static int __show_device_domain_translation(struct device *dev, void *data) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *domain = info->domain; + struct dmar_domain *domain; struct seq_file *m = data; u64 path[6] = { 0 }; + domain = to_dmar_domain(iommu_get_domain_for_dev(dev)); if (!domain) return 0; @@ -359,20 +359,38 @@ static int show_device_domain_translation(struct device *dev, void *data) pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path); seq_putc(m, '\n'); - return 0; + return 1; } -static int domain_translation_struct_show(struct seq_file *m, void *unused) +static int show_device_domain_translation(struct device *dev, void *data) { - unsigned long flags; - int ret; + struct iommu_group *group; - spin_lock_irqsave(_domain_lock, flags); - ret = bus_for_each_dev(_bus_type, NULL, m, - show_device_domain_translation); - spin_unlock_irqrestore(_domain_lock, flags); + group = iommu_group_get(dev); + if (group) { + /* +* The group->mutex is held across the callback, which will +* block calls to iommu_attach/detach_group/device. Hence, +* the domain of the device will not change during traversal. +* +* All devices in an iommu group share a single domain, hence +* we only dump the domain of the first device. Even though, +* this code still possibly races with the iommu_unmap() +* interface. This could be solved by RCU-freeing the page +* table pages in the iommu_unmap() path. 
+*/ + iommu_group_for_each_dev(group, data, +__show_device_domain_translation); + iommu_group_put(group); + } - return ret; + return 0; +} + +static int domain_translation_struct_show(struct seq_file *m, void *unused) +{ + return bus_for_each_dev(_bus_type, NULL, m, + show_device_domain_translation); } DEFINE_SHOW_ATTRIBUTE(domain_translation_struct); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 19024dc52735..a39d72a9d1cf 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -314,7 +314,7 @@ static int iommu_skip_te_disable; #define IDENTMAP_GFX 2 #define IDENTMAP_AZALIA4 -DEFINE_SPINLOCK(device_domain_lock); +static DEFINE_SPINLOCK(device_domain_lock); static LIST_HEAD(device_domain_list); /* diff --git a/drivers/iommu/intel/iommu.h
Re: [PATCH v2] iommu/vt-d: Make DMAR_UNITS_SUPPORTED a config setting
On 2022/6/15 05:12, Steve Wahl wrote: On Tue, Jun 14, 2022 at 12:01:45PM -0700, Jerry Snitselaar wrote: On Tue, Jun 14, 2022 at 11:45:35AM -0500, Steve Wahl wrote: On Tue, Jun 14, 2022 at 10:21:29AM +0800, Baolu Lu wrote: On 2022/6/14 09:54, Jerry Snitselaar wrote: On Mon, Jun 13, 2022 at 6:51 PM Baolu Lu wrote: On 2022/6/14 09:44, Jerry Snitselaar wrote: On Mon, Jun 13, 2022 at 6:36 PM Baolu Lu wrote: On 2022/6/14 04:57, Jerry Snitselaar wrote: On Thu, May 12, 2022 at 10:13:09AM -0500, Steve Wahl wrote: To support up to 64 sockets with 10 DMAR units each (640), make the value of DMAR_UNITS_SUPPORTED adjustable by a config variable, CONFIG_DMAR_UNITS_SUPPORTED, and make it's default 1024 when MAXSMP is set. If the available hardware exceeds DMAR_UNITS_SUPPORTED (previously set to MAX_IO_APICS, or 128), it causes these messages: "DMAR: Failed to allocate seq_id", "DMAR: Parse DMAR table failure.", and "x2apic: IRQ remapping doesn't support X2APIC mode x2apic disabled"; and the system fails to boot properly. Signed-off-by: Steve Wahl --- Note that we could not find a reason for connecting DMAR_UNITS_SUPPORTED to MAX_IO_APICS as was done previously. Perhaps it seemed like the two would continue to match on earlier processors. There doesn't appear to be kernel code that assumes that the value of one is related to the other. v2: Make this value a config option, rather than a fixed constant. The default values should match previous configuration except in the MAXSMP case. Keeping the value at a power of two was requested by Kevin Tian. 
drivers/iommu/intel/Kconfig | 6 ++ include/linux/dmar.h| 6 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 247d0f2d5fdf..fdbda77ac21e 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -9,6 +9,12 @@ config DMAR_PERF config DMAR_DEBUG bool +config DMAR_UNITS_SUPPORTED +int "Number of DMA Remapping Units supported" Also, should there be a "depends on (X86 || IA64)" here? Do you have any compilation errors or warnings? Best regards, baolu I think it is probably harmless since it doesn't get used elsewhere, but our tooling was complaining to me because DMAR_UNITS_SUPPORTED was being autogenerated into the configs for the non-x86 architectures we build (aarch64, s390x, ppcle64). We have files corresponding to the config options that it looks at, and I had one for x86 and not the others so it noticed the discrepancy. So with "depends on (X86 || IA64)", that tool doesn't complain anymore, right? Best regards, baolu Yes, with the depends it no longer happens. The dmar code only exists on X86 and IA64 arch's. Adding this depending makes sense to me. I will add it if no objections. I think that works after Baolu's patchset that makes intel-iommu.h private. I'm pretty sure it wouldn't have worked before that. No objections. Yes, I think applying it with the depends prior to Baolu's change would still run into the issue from the KTR report if someone compiled without INTEL_IOMMU enabled. This was dealing with being able to do something like: make allmodconfig ARCH=arm64 ; grep DMAR_UNITS .config and finding CONFIG_DMAR_UNITS_SUPPORTED=64. Thinking some more though, instead of the depends being on the arch would depending on DMAR_TABLE or INTEL_IOMMU be more appropriate? At least in my limited exploration, depending on INTEL_IOMMU yields compile errors, but depending upon DMAR_TABLE appears to work fine. 
DMAR_TABLE is used beyond INTEL_IOMMU, so depending on DMAR_TABLE seems better. Steve, do you mind posting a v3 with this fixed? Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/1] iommu/vt-d: Add set_dev_pasid callbacks for default domain
On 2022/6/14 15:19, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 14, 2022 2:13 PM On 2022/6/14 13:36, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 14, 2022 12:48 PM On 2022/6/14 12:02, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 14, 2022 11:44 AM This allows the upper layers to set a domain to a PASID of a device if the PASID feature is supported by the IOMMU hardware. The typical use cases are, for example, kernel DMA with PASID and hardware assisted mediated device drivers. why is it not part of the series for those use cases? There is no consumer of added callbacks in this patch... It could be. I just wanted to maintain the integrity of Intel IOMMU driver implementation. but let's not add dead code. and this patch is actually a right step simply from set_dev_pasid() p.o.v hence you should include in any series which first tries to use that interface. Yes, that's my intention. If it reviews well, we can include it in the driver's implementation. Then you should make it clear in the first place. otherwise a patch like this implies a review for merge. Yeah! Will update this in the next version. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 11/12] iommu/vt-d: Use device_domain_lock accurately
On 2022/6/14 15:16, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 14, 2022 10:52 AM The device_domain_lock is used to protect the device tracking list of a domain. Remove unnecessary spin_lock/unlock()'s and move the necessary ones around the list access. Signed-off-by: Lu Baolu --- drivers/iommu/intel/iommu.c | 68 +++-- 1 file changed, 27 insertions(+), 41 deletions(-) [...] +iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu, + u8 bus, u8 devfn) { - struct device_domain_info *info; - - assert_spin_locked(_domain_lock); + struct device_domain_info *info = NULL, *tmp; + unsigned long flags; if (!iommu->qi) return NULL; - list_for_each_entry(info, >devices, link) - if (info->iommu == iommu && info->bus == bus && - info->devfn == devfn) { - if (info->ats_supported && info->dev) - return info; + spin_lock_irqsave(_domain_lock, flags); + list_for_each_entry(tmp, >devices, link) { + if (tmp->iommu == iommu && tmp->bus == bus && + tmp->devfn == devfn) { + if (tmp->ats_supported) + info = tmp; Directly returning with unlock here is clearer than adding another tmp variable... Sure. @@ -2460,15 +2450,14 @@ static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev) if (!iommu) return -ENODEV; - spin_lock_irqsave(_domain_lock, flags); - info->domain = domain; ret = domain_attach_iommu(domain, iommu); - if (ret) { - spin_unlock_irqrestore(_domain_lock, flags); + if (ret) return ret; - } + + spin_lock_irqsave(_domain_lock, flags); list_add(>link, >devices); spin_unlock_irqrestore(_domain_lock, flags); + info->domain = domain; This is incorrect. You need fully initialize the object before adding it to the list. Otherwise a search right after above unlock and before assigning info->domain will get a wrong data Fair enough. Will fix it in the next version. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 10/12] iommu/vt-d: Fold __dmar_remove_one_dev_info() into its caller
On 2022/6/14 15:07, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 14, 2022 10:52 AM Fold __dmar_remove_one_dev_info() into dmar_remove_one_dev_info() which is its only caller. Make the spin lock critical range only cover the device list change code and remove some unnecessary checks. Signed-off-by: Lu Baolu --- drivers/iommu/intel/iommu.c | 34 +- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index af22690f44c8..8345e0c0824c 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -295,7 +295,6 @@ static LIST_HEAD(dmar_satc_units); static int g_num_of_iommus; static void dmar_remove_one_dev_info(struct device *dev); -static void __dmar_remove_one_dev_info(struct device_domain_info *info); int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); @@ -4141,20 +4140,14 @@ static void domain_context_clear(struct device_domain_info *info) _context_clear_one_cb, info); } -static void __dmar_remove_one_dev_info(struct device_domain_info *info) +static void dmar_remove_one_dev_info(struct device *dev) { - struct dmar_domain *domain; - struct intel_iommu *iommu; - - assert_spin_locked(_domain_lock); - - if (WARN_ON(!info)) - return; - - iommu = info->iommu; - domain = info->domain; + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *domain = info->domain; this local variable is not required as there is just one reference to info->domain. Yes. It could be removed and use info->domain directly. btw I didn't see info->domain is cleared in this path. Is it correct? It's better to clear here. I will make this change in my in-process blocking domain series. But it doesn't cause any real problems because the Intel IOMMU driver supports default domain, hence the logic here is info->domain is replaced, but not cleared. 
Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 06/12] iommu/vt-d: Acquiring lock in domain ID allocation helpers
On 2022/6/14 14:52, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 14, 2022 10:52 AM The iommu->lock is used to protect the per-IOMMU domain ID resource. Moving the lock into the ID alloc/free helpers makes the code more compact. At the same time, the device_domain_lock is irrelevant to the domain ID resource, remove its assertion as well. On the other hand, the iommu->lock is never used in interrupt context, there's no need to use the irqsave variant of the spinlock calls. I still prefer to separating reduction of lock ranges from changing irqsave. Locking is tricky. From bisect p.o.v. it will be a lot easier if we just change one logic in one patch. Fair enough. I will do this in the next version. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 03/12] iommu/vt-d: Remove clearing translation data in disable_dmar_iommu()
On 2022/6/14 14:49, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 14, 2022 10:51 AM The disable_dmar_iommu() is called when IOMMU initialization fails or the IOMMU is hot-removed from the system. In both cases, there is no need to clear the IOMMU translation data structures for devices. On the initialization path, the device probing only happens after the IOMMU is initialized successfully, hence there're no translation data structures. Out of curiosity. With kexec the IOMMU may contain stale mappings from the old kernel. Then is it meaningful to disable IOMMU after the new kernel fails to initialize it properly? For kexec kernel, if the IOMMU is detected to be pre-enabled, the IOMMU driver will try to copy tables from the old kernel. If copying table fails, the IOMMU driver will disable IOMMU and do the normal initialization. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 01/12] iommu/vt-d: debugfs: Remove device_domain_lock usage
Hi Kevin, Thanks for your reviewing. On 2022/6/14 14:43, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 14, 2022 10:51 AM The domain_translation_struct debugfs node is used to dump the DMAR page tables for the PCI devices. It potentially races with setting domains to devices. The existing code uses a global spinlock device_domain_lock to avoid the races, but this is problematical as this lock is only used to protect the device tracking lists of each domain. is it really problematic at this point? Before following patches are applied using device_domain_lock should have similar effect as holding the group lock. The device_domain_lock only protects the device tracking list of the domain, it doesn't include the domain pointer stored in the dev_info structure. That's really protected by the group->mutex. Here it might make more sense to just focus on removing the use of device_domain_lock outside of iommu.c. Just that using group lock is cleaner and more compatible to following cleanups. Fair enough. I will update the commit message with above statement. and it's worth mentioning that racing with page table updates is out of the scope of this series. Probably also add a comment in the code to clarify this point. Sure. This replaces device_domain_lock with group->mutex to protect page tables from setting a new domain. This also makes device_domain_lock static as it is now only used inside the file. s/the file/iommu.c/ Sure. 
Signed-off-by: Lu Baolu --- drivers/iommu/intel/iommu.h | 1 - drivers/iommu/intel/debugfs.c | 49 +-- drivers/iommu/intel/iommu.c | 2 +- 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index a22adfbdf870..8a6d64d726c0 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -480,7 +480,6 @@ enum { #define VTD_FLAG_SVM_CAPABLE (1 << 2) extern int intel_iommu_sm; -extern spinlock_t device_domain_lock; #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) #define pasid_supported(iommu)(sm_supported(iommu) && \ diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index d927ef10641b..5ebfe32265d5 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -342,15 +342,23 @@ static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, } } -static int show_device_domain_translation(struct device *dev, void *data) +struct show_domain_opaque { + struct device *dev; + struct seq_file *m; +}; Sounds redundant as both bus_for_each_dev() and iommu_group_for_each_dev() declares the same fn type which accepts a device pointer and void *data. + +static int __show_device_domain_translation(struct device *dev, void *data) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *domain = info->domain; - struct seq_file *m = data; + struct show_domain_opaque *opaque = data; + struct device_domain_info *info; + struct seq_file *m = opaque->m; + struct dmar_domain *domain; u64 path[6] = { 0 }; - if (!domain) + if (dev != opaque->dev) return 0; not required. Together with above comment. The iommu group might have other devices. I only want to dump the domain of the secific @opaque->dev. It reads a bit confusing, but it's the only helper I can use outside of drivers/iommu/iommu.c. Or, since all devices in the iommu group share the same domain, hence only dump once? 
+ info = dev_iommu_priv_get(dev); + domain = info->domain; seq_printf(m, "Device %s @0x%llx\n", dev_name(dev), (u64)virt_to_phys(domain->pgd)); @@ -359,20 +367,33 @@ static int show_device_domain_translation(struct device *dev, void *data) pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path); seq_putc(m, '\n'); - return 0; + return 1; } -static int domain_translation_struct_show(struct seq_file *m, void *unused) +static int show_device_domain_translation(struct device *dev, void *data) { - unsigned long flags; - int ret; + struct show_domain_opaque opaque = {dev, data}; + struct iommu_group *group; - spin_lock_irqsave(_domain_lock, flags); - ret = bus_for_each_dev(_bus_type, NULL, m, - show_device_domain_translation); - spin_unlock_irqrestore(_domain_lock, flags); + group = iommu_group_get(dev); + if (group) { + /* +* The group->mutex is held across the callback, which will +* block calls to iommu_attach/detach_group/device. Hence, +* the domain of the device will not change during traversal. +*/ + iommu_group_for_each_dev(group, , +
Re: [PATCH 1/1] iommu/vt-d: Add set_dev_pasid callbacks for default domain
On 2022/6/14 13:36, Tian, Kevin wrote: From: Baolu Lu Sent: Tuesday, June 14, 2022 12:48 PM On 2022/6/14 12:02, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 14, 2022 11:44 AM This allows the upper layers to set a domain to a PASID of a device if the PASID feature is supported by the IOMMU hardware. The typical use cases are, for example, kernel DMA with PASID and hardware assisted mediated device drivers. why is it not part of the series for those use cases? There is no consumer of added callbacks in this patch... It could be. I just wanted to maintain the integrity of Intel IOMMU driver implementation. but let's not add dead code. and this patch is actually a right step simply from set_dev_pasid() p.o.v hence you should include in any series which first tries to use that interface. Yes, that's my intention. If it reviews well, we can include it in the driver's implementation. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/1] iommu/vt-d: Add set_dev_pasid callbacks for default domain
On 2022/6/14 12:02, Tian, Kevin wrote: From: Lu Baolu Sent: Tuesday, June 14, 2022 11:44 AM This allows the upper layers to set a domain to a PASID of a device if the PASID feature is supported by the IOMMU hardware. The typical use cases are, for example, kernel DMA with PASID and hardware assisted mediated device drivers. why is it not part of the series for those use cases? There is no consumer of added callbacks in this patch... It could be. I just wanted to maintain the integrity of Intel IOMMU driver implementation. +/* PCI domain-subdevice relationship */ +struct subdev_domain_info { + struct list_head link_domain; /* link to domain siblings */ + struct device *dev; /* physical device derived from */ + ioasid_t pasid; /* PASID on physical device */ +}; + It's not subdev. Just dev+pasid in iommu's context. How about struct device_pasid_info? Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2] iommu/vt-d: Make DMAR_UNITS_SUPPORTED a config setting
On 2022/6/14 09:54, Jerry Snitselaar wrote: On Mon, Jun 13, 2022 at 6:51 PM Baolu Lu wrote: On 2022/6/14 09:44, Jerry Snitselaar wrote: On Mon, Jun 13, 2022 at 6:36 PM Baolu Lu wrote: On 2022/6/14 04:57, Jerry Snitselaar wrote: On Thu, May 12, 2022 at 10:13:09AM -0500, Steve Wahl wrote: To support up to 64 sockets with 10 DMAR units each (640), make the value of DMAR_UNITS_SUPPORTED adjustable by a config variable, CONFIG_DMAR_UNITS_SUPPORTED, and make it's default 1024 when MAXSMP is set. If the available hardware exceeds DMAR_UNITS_SUPPORTED (previously set to MAX_IO_APICS, or 128), it causes these messages: "DMAR: Failed to allocate seq_id", "DMAR: Parse DMAR table failure.", and "x2apic: IRQ remapping doesn't support X2APIC mode x2apic disabled"; and the system fails to boot properly. Signed-off-by: Steve Wahl --- Note that we could not find a reason for connecting DMAR_UNITS_SUPPORTED to MAX_IO_APICS as was done previously. Perhaps it seemed like the two would continue to match on earlier processors. There doesn't appear to be kernel code that assumes that the value of one is related to the other. v2: Make this value a config option, rather than a fixed constant. The default values should match previous configuration except in the MAXSMP case. Keeping the value at a power of two was requested by Kevin Tian. drivers/iommu/intel/Kconfig | 6 ++ include/linux/dmar.h| 6 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 247d0f2d5fdf..fdbda77ac21e 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -9,6 +9,12 @@ config DMAR_PERF config DMAR_DEBUG bool +config DMAR_UNITS_SUPPORTED +int "Number of DMA Remapping Units supported" Also, should there be a "depends on (X86 || IA64)" here? Do you have any compilation errors or warnings? 
Best regards, baolu I think it is probably harmless since it doesn't get used elsewhere, but our tooling was complaining to me because DMAR_UNITS_SUPPORTED was being autogenerated into the configs for the non-x86 architectures we build (aarch64, s390x, ppcle64). We have files corresponding to the config options that it looks at, and I had one for x86 and not the others so it noticed the discrepancy. So with "depends on (X86 || IA64)", that tool doesn't complain anymore, right? Best regards, baolu Yes, with the depends it no longer happens. The dmar code only exists on X86 and IA64 arch's. Adding this depending makes sense to me. I will add it if no objections. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2] iommu/vt-d: Make DMAR_UNITS_SUPPORTED a config setting
On 2022/6/14 09:44, Jerry Snitselaar wrote: On Mon, Jun 13, 2022 at 6:36 PM Baolu Lu wrote: On 2022/6/14 04:57, Jerry Snitselaar wrote: On Thu, May 12, 2022 at 10:13:09AM -0500, Steve Wahl wrote: To support up to 64 sockets with 10 DMAR units each (640), make the value of DMAR_UNITS_SUPPORTED adjustable by a config variable, CONFIG_DMAR_UNITS_SUPPORTED, and make it's default 1024 when MAXSMP is set. If the available hardware exceeds DMAR_UNITS_SUPPORTED (previously set to MAX_IO_APICS, or 128), it causes these messages: "DMAR: Failed to allocate seq_id", "DMAR: Parse DMAR table failure.", and "x2apic: IRQ remapping doesn't support X2APIC mode x2apic disabled"; and the system fails to boot properly. Signed-off-by: Steve Wahl --- Note that we could not find a reason for connecting DMAR_UNITS_SUPPORTED to MAX_IO_APICS as was done previously. Perhaps it seemed like the two would continue to match on earlier processors. There doesn't appear to be kernel code that assumes that the value of one is related to the other. v2: Make this value a config option, rather than a fixed constant. The default values should match previous configuration except in the MAXSMP case. Keeping the value at a power of two was requested by Kevin Tian. drivers/iommu/intel/Kconfig | 6 ++ include/linux/dmar.h| 6 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 247d0f2d5fdf..fdbda77ac21e 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -9,6 +9,12 @@ config DMAR_PERF config DMAR_DEBUG bool +config DMAR_UNITS_SUPPORTED +int "Number of DMA Remapping Units supported" Also, should there be a "depends on (X86 || IA64)" here? Do you have any compilation errors or warnings? 
Best regards, baolu I think it is probably harmless since it doesn't get used elsewhere, but our tooling was complaining to me because DMAR_UNITS_SUPPORTED was being autogenerated into the configs for the non-x86 architectures we build (aarch64, s390x, ppcle64). We have files corresponding to the config options that it looks at, and I had one for x86 and not the others so it noticed the discrepancy. So with "depends on (X86 || IA64)", that tool doesn't complain anymore, right? Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2] iommu/vt-d: Make DMAR_UNITS_SUPPORTED a config setting
On 2022/6/14 04:57, Jerry Snitselaar wrote: On Thu, May 12, 2022 at 10:13:09AM -0500, Steve Wahl wrote: To support up to 64 sockets with 10 DMAR units each (640), make the value of DMAR_UNITS_SUPPORTED adjustable by a config variable, CONFIG_DMAR_UNITS_SUPPORTED, and make it's default 1024 when MAXSMP is set. If the available hardware exceeds DMAR_UNITS_SUPPORTED (previously set to MAX_IO_APICS, or 128), it causes these messages: "DMAR: Failed to allocate seq_id", "DMAR: Parse DMAR table failure.", and "x2apic: IRQ remapping doesn't support X2APIC mode x2apic disabled"; and the system fails to boot properly. Signed-off-by: Steve Wahl --- Note that we could not find a reason for connecting DMAR_UNITS_SUPPORTED to MAX_IO_APICS as was done previously. Perhaps it seemed like the two would continue to match on earlier processors. There doesn't appear to be kernel code that assumes that the value of one is related to the other. v2: Make this value a config option, rather than a fixed constant. The default values should match previous configuration except in the MAXSMP case. Keeping the value at a power of two was requested by Kevin Tian. drivers/iommu/intel/Kconfig | 6 ++ include/linux/dmar.h| 6 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 247d0f2d5fdf..fdbda77ac21e 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -9,6 +9,12 @@ config DMAR_PERF config DMAR_DEBUG bool +config DMAR_UNITS_SUPPORTED + int "Number of DMA Remapping Units supported" Also, should there be a "depends on (X86 || IA64)" here? Do you have any compilation errors or warnings? 
Best regards, baolu + default 1024 if MAXSMP + default 128 if X86_64 + default 64 + config INTEL_IOMMU bool "Support for Intel IOMMU using DMA Remapping Devices" depends on PCI_MSI && ACPI && (X86 || IA64) diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 45e903d84733..0c03c1845c23 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -18,11 +18,7 @@ struct acpi_dmar_header; -#ifdef CONFIG_X86 -# define DMAR_UNITS_SUPPORTED MAX_IO_APICS -#else -# define DMAR_UNITS_SUPPORTED 64 -#endif +#define DMAR_UNITS_SUPPORTED CONFIG_DMAR_UNITS_SUPPORTED /* DMAR Flags */ #define DMAR_INTR_REMAP 0x1 -- 2.26.2 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2] iommu/vt-d: Make DMAR_UNITS_SUPPORTED a config setting
On 2022/6/14 04:38, Jerry Snitselaar wrote: On Thu, May 12, 2022 at 10:13:09AM -0500, Steve Wahl wrote: To support up to 64 sockets with 10 DMAR units each (640), make the value of DMAR_UNITS_SUPPORTED adjustable by a config variable, CONFIG_DMAR_UNITS_SUPPORTED, and make it's default 1024 when MAXSMP is set. If the available hardware exceeds DMAR_UNITS_SUPPORTED (previously set to MAX_IO_APICS, or 128), it causes these messages: "DMAR: Failed to allocate seq_id", "DMAR: Parse DMAR table failure.", and "x2apic: IRQ remapping doesn't support X2APIC mode x2apic disabled"; and the system fails to boot properly. Signed-off-by: Steve Wahl --- Note that we could not find a reason for connecting DMAR_UNITS_SUPPORTED to MAX_IO_APICS as was done previously. Perhaps it seemed like the two would continue to match on earlier processors. There doesn't appear to be kernel code that assumes that the value of one is related to the other. v2: Make this value a config option, rather than a fixed constant. The default values should match previous configuration except in the MAXSMP case. Keeping the value at a power of two was requested by Kevin Tian. drivers/iommu/intel/Kconfig | 6 ++ include/linux/dmar.h| 6 +- 2 files changed, 7 insertions(+), 5 deletions(-) Baolu do you have this queued up for v5.20? Also do you have a public repo where you keep the vt-d changes before sending Joerg the patches for a release? Yes. I have started to queue patches for v5.20. They could be found on github: https://github.com/LuBaolu/intel-iommu/commits/vtd-next-for-v5.20 Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v8 02/11] iommu: Add max_pasids field in struct dev_iommu
On 2022/6/10 17:01, Tian, Kevin wrote: From: Baolu Lu Sent: Friday, June 10, 2022 2:47 PM On 2022/6/10 03:01, Raj, Ashok wrote: On Tue, Jun 07, 2022 at 09:49:33AM +0800, Lu Baolu wrote: @@ -218,6 +219,30 @@ static void dev_iommu_free(struct device *dev) kfree(param); } +static u32 dev_iommu_get_max_pasids(struct device *dev) +{ + u32 max_pasids = dev->iommu->iommu_dev->max_pasids; + u32 num_bits; + int ret; + + if (!max_pasids) + return 0; + + if (dev_is_pci(dev)) { + ret = pci_max_pasids(to_pci_dev(dev)); + if (ret < 0) + return 0; + + return min_t(u32, max_pasids, ret); Ah.. that answers my other question to consider device pasid-max. I guess if we need any enforcement of restricting devices that aren't supporting the full PASID, that will be done by some higher layer? The mm->pasid style of SVA is explicitly enabled through iommu_dev_enable_feature(IOMMU_DEV_FEAT_SVA). The IOMMU driver specific restriction might be put there? too many returns in this function, maybe setup all returns to the end of the function might be elegant? I didn't find cleanup room after a quick scan of the code. But sure, let me go through code again offline. If we do care: +static u32 dev_iommu_get_max_pasids(struct device *dev) +{ + u32 max_pasids = 0, + u32 num_bits = 0; + int ret; + + if (dev_is_pci(dev)) { + ret = pci_max_pasids(to_pci_dev(dev)); + if (ret > 0) + max_pasids = ret; + } else { + ret = device_property_read_u32(dev, "pasid-num-bits", _bits); + if (!ret) + max_pasids = 1UL << num_bits; + } + + return min_t(u32, max_pasids, dev->iommu->iommu_dev->max_pasids); +} Great! Cleaner and more compact than mine. Thank you! Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v8 04/11] iommu: Add sva iommu_domain support
On 2022/6/10 04:25, Raj, Ashok wrote: Hi Baolu Hi Ashok, some minor nits. Thanks for your comments. On Tue, Jun 07, 2022 at 09:49:35AM +0800, Lu Baolu wrote: The sva iommu_domain represents a hardware pagetable that the IOMMU hardware could use for SVA translation. This adds some infrastructure to support SVA domain in the iommu common layer. It includes: - Extend the iommu_domain to support a new IOMMU_DOMAIN_SVA domain type. The IOMMU drivers that support SVA should provide the sva domain specific iommu_domain_ops. - Add a helper to allocate an SVA domain. The iommu_domain_free() is still used to free an SVA domain. - Add helpers to attach an SVA domain to a device and the reverse operation. Some buses, like PCI, route packets without considering the PASID value. Thus a DMA target address with PASID might be treated as P2P if the address falls into the MMIO BAR of other devices in the group. To make things simple, the attach/detach interfaces only apply to devices belonging to the singleton groups, and the singleton is immutable in fabric i.e. not affected by hotplug. The iommu_attach/detach_device_pasid() can be used for other purposes, such as kernel DMA with pasid, mediation device, etc. Suggested-by: Jean-Philippe Brucker Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker --- include/linux/iommu.h | 45 - drivers/iommu/iommu.c | 93 +++ 2 files changed, 136 insertions(+), 2 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3fbad42c0bf8..9173c5741447 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -64,6 +64,9 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_PT (1U << 2) /* Domain is identity mapped */ #define __IOMMU_DOMAIN_DMA_FQ (1U << 3) /* DMA-API uses flush queue*/ +#define __IOMMU_DOMAIN_SHARED (1U << 4) /* Page table shared from CPU */ s/from CPU/with CPU Sure. +#define __IOMMU_DOMAIN_HOST_VA (1U << 5) /* Host CPU virtual address */ Do you mean general CPU VA? 
or Host CPU VA, I'm reading the latter as 2nd stage? Host CPU VA. In the near future, we will add another flag _GUEST_VA, so that the shared page table types are distiguished. + /* * This are the possible domain-types * @@ -86,15 +89,24 @@ struct iommu_domain_geometry { #define IOMMU_DOMAIN_DMA_FQ (__IOMMU_DOMAIN_PAGING |\ __IOMMU_DOMAIN_DMA_API | \ __IOMMU_DOMAIN_DMA_FQ) +#define IOMMU_DOMAIN_SVA (__IOMMU_DOMAIN_SHARED |\ +__IOMMU_DOMAIN_HOST_VA) Doesn't shared automatically mean CPU VA? Do we need another flag? Yes. Shared means CPU VA, but there're many types. Besides above two, we also see the shared KVM/EPT. struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; unsigned long pgsize_bitmap;/* Bitmap of page sizes in use */ - iommu_fault_handler_t handler; - void *handler_token; struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; + union { + struct {/* IOMMU_DOMAIN_DMA */ + iommu_fault_handler_t handler; + void *handler_token; + }; + struct {/* IOMMU_DOMAIN_SVA */ + struct mm_struct *mm; + }; + }; }; static inline bool iommu_is_dma_domain(struct iommu_domain *domain) @@ -262,6 +274,8 @@ struct iommu_ops { * struct iommu_domain_ops - domain specific operations * @attach_dev: attach an iommu domain to a device * @detach_dev: detach an iommu domain from a device + * @set_dev_pasid: set an iommu domain to a pasid of device + * @block_dev_pasid: block pasid of device from using iommu domain * @map: map a physically contiguous memory region to an iommu domain * @map_pages: map a physically contiguous set of pages of the same size to * an iommu domain. 
@@ -282,6 +296,10 @@ struct iommu_ops { struct iommu_domain_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); void (*detach_dev)(struct iommu_domain *domain, struct device *dev); + int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev, +ioasid_t pasid); + void (*block_dev_pasid)(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid); int (*map)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); @@ -679,6 +697,12 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); void iommu_group_release_dma_owner(struct iommu_group *group); bool iommu_group_dma_owner_claimed(struct iommu_group *group); +struct
Re: [PATCH v8 02/11] iommu: Add max_pasids field in struct dev_iommu
On 2022/6/10 03:01, Raj, Ashok wrote: On Tue, Jun 07, 2022 at 09:49:33AM +0800, Lu Baolu wrote: Use this field to save the number of PASIDs that a device is able to consume. It is a generic attribute of a device and lifting it into the per-device dev_iommu struct could help to avoid the boilerplate code in various IOMMU drivers. Signed-off-by: Lu Baolu --- include/linux/iommu.h | 2 ++ drivers/iommu/iommu.c | 26 ++ 2 files changed, 28 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 03fbb1b71536..d50afb2c9a09 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -364,6 +364,7 @@ struct iommu_fault_param { * @fwspec:IOMMU fwspec data * @iommu_dev: IOMMU device this device is linked to * @priv: IOMMU Driver private data + * @max_pasids: number of PASIDs device can consume * * TODO: migrate other per device data pointers under iommu_dev_data, e.g. *struct iommu_group *iommu_group; @@ -375,6 +376,7 @@ struct dev_iommu { struct iommu_fwspec *fwspec; struct iommu_device *iommu_dev; void*priv; + u32 max_pasids; }; int iommu_device_register(struct iommu_device *iommu, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 847ad47a2dfd..adac85ccde73 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -20,6 +20,7 @@ #include #include #include +#include Is this needed for this patch? Yes. It's for pci_max_pasids(). #include #include #include @@ -218,6 +219,30 @@ static void dev_iommu_free(struct device *dev) kfree(param); } +static u32 dev_iommu_get_max_pasids(struct device *dev) +{ + u32 max_pasids = dev->iommu->iommu_dev->max_pasids; + u32 num_bits; + int ret; + + if (!max_pasids) + return 0; + + if (dev_is_pci(dev)) { + ret = pci_max_pasids(to_pci_dev(dev)); + if (ret < 0) + return 0; + + return min_t(u32, max_pasids, ret); Ah.. that answers my other question to consider device pasid-max. 
I guess if we need any enforcement of restricting devices that aren't supporting the full PASID, that will be done by some higher layer? The mm->pasid style of SVA is explicitly enabled through iommu_dev_enable_feature(IOMMU_DEV_FEAT_SVA). The IOMMU driver specific restriction might be put there? too many returns in this function, maybe setup all returns to the end of the function might be elegant? I didn't find cleanup room after a quick scan of the code. But sure, let me go through code again offline. + } + + ret = device_property_read_u32(dev, "pasid-num-bits", _bits); + if (ret) + return 0; + + return min_t(u32, max_pasids, 1UL << num_bits); +} + static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { const struct iommu_ops *ops = dev->bus->iommu_ops; @@ -243,6 +268,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list } dev->iommu->iommu_dev = iommu_dev; + dev->iommu->max_pasids = dev_iommu_get_max_pasids(dev); group = iommu_group_get_for_dev(dev); if (IS_ERR(group)) { -- 2.25.1 Best regards, Baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v8 01/11] iommu: Add max_pasids field in struct iommu_device
On 2022/6/10 01:25, Raj, Ashok wrote: diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 4f29139bbfc3..e065cbe3c857 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -479,7 +479,6 @@ enum { #define VTD_FLAG_IRQ_REMAP_PRE_ENABLED(1 << 1) #define VTD_FLAG_SVM_CAPABLE (1 << 2) -extern int intel_iommu_sm; extern spinlock_t device_domain_lock; #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) @@ -786,6 +785,7 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, extern const struct iommu_ops intel_iommu_ops; #ifdef CONFIG_INTEL_IOMMU +extern int intel_iommu_sm; extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; @@ -802,6 +802,7 @@ static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu) } #define dmar_disabled (1) #define intel_iommu_enabled (0) +#define intel_iommu_sm (0) Is the above part of this patch? Or should be moved up somewhere? This is to make pasid_supported() usable in dmar.c. It's only needed by the change in this patch. I should make this clear in the commit message. :-) Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [RFC PATCHES 1/2] iommu: Add RCU-protected page free support
On 2022/6/10 01:06, Raj, Ashok wrote: On Thu, Jun 09, 2022 at 03:08:10PM +0800, Lu Baolu wrote: The IOMMU page tables are updated using iommu_map/unmap() interfaces. Currently, there is no mandatory requirement for drivers to use locks to ensure concurrent updates to page tables, because it's assumed that overlapping IOVA ranges do not have concurrent updates. Therefore the IOMMU drivers only need to take care of concurrent updates to level page table entries. The last part doesn't read well.. s/updates to level page table entries/ updates to page-table entries at the same level But enabling new features challenges this assumption. For example, the hardware assisted dirty page tracking feature requires scanning page tables in interfaces other than mapping and unmapping. This might result in a use-after-free scenario in which a level page table has been freed by the unmap() interface, while another thread is scanning the next level page table. This adds RCU-protected page free support so that the pages are really freed and reused after a RCU grace period. Hence, the page tables are safe for scanning within a rcu_read_lock critical region. Considering that scanning the page table is a rare case, this also adds a domain flag and the RCU-protected page free is only used when this flat is set. s/flat/flag Above updated. Thank you! Signed-off-by: Lu Baolu --- include/linux/iommu.h | 9 + drivers/iommu/iommu.c | 23 +++ 2 files changed, 32 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5e1afe169549..6f68eabb8567 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -95,6 +95,7 @@ struct iommu_domain { void *handler_token; struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; + unsigned long concurrent_traversal:1; Does this need to be a bitfield? Even though you are needing just one bit now, you can probably make have maskbits? 
As discussed in another reply, I am about to drop the driver opt-in and wrapper the helper around the iommu_iotlb_gather. }; static inline bool iommu_is_dma_domain(struct iommu_domain *domain) @@ -657,6 +658,12 @@ static inline void dev_iommu_priv_set(struct device *dev, void *priv) dev->iommu->priv = priv; } +static inline void domain_set_concurrent_traversal(struct iommu_domain *domain, + bool value) +{ + domain->concurrent_traversal = value; +} + int iommu_probe_device(struct device *dev); void iommu_release_device(struct device *dev); @@ -677,6 +684,8 @@ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner); void iommu_group_release_dma_owner(struct iommu_group *group); bool iommu_group_dma_owner_claimed(struct iommu_group *group); +void iommu_free_pgtbl_pages(struct iommu_domain *domain, + struct list_head *pages); #else /* CONFIG_IOMMU_API */ struct iommu_ops {}; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 847ad47a2dfd..ceeb97ebe3e2 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3252,3 +3252,26 @@ bool iommu_group_dma_owner_claimed(struct iommu_group *group) return user; } EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed); + +static void pgtble_page_free_rcu(struct rcu_head *rcu) maybe the names can be consistent? pgtble_ vs pgtbl below. vote to drop the 'e' :-) Updated. +{ + struct page *page = container_of(rcu, struct page, rcu_head); + + __free_pages(page, 0); +} + +void iommu_free_pgtbl_pages(struct iommu_domain *domain, + struct list_head *pages) +{ + struct page *page, *next; + + if (!domain->concurrent_traversal) { + put_pages_list(pages); + return; + } + + list_for_each_entry_safe(page, next, pages, lru) { + list_del(>lru); + call_rcu(>rcu_head, pgtble_page_free_rcu); + } +} -- 2.25.1 Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [RFC PATCHES 1/2] iommu: Add RCU-protected page free support
On 2022/6/9 21:32, Jason Gunthorpe wrote: On Thu, Jun 09, 2022 at 02:19:06PM +0100, Robin Murphy wrote: Is there a significant benefit to keeping both paths, or could we get away with just always using RCU? Realistically, pagetable pages aren't likely to be freed all that frequently, except perhaps at domain teardown, but that shouldn't really be performance-critical, and I guess we could stick an RCU sync point in iommu_domain_free() if we're really worried about releasing larger quantities of pages back to the allocator ASAP? I think you are right, anything that uses the iommu_iotlb_gather may as well use RCU too. IIRC the allocators already know that RCU is often sitting on freed-memory and have some contingency to flush it out before OOMing, so nothing special should be needed. Fair enough. How about below code? static void pgtble_page_free_rcu(struct rcu_head *rcu) { struct page *page = container_of(rcu, struct page, rcu_head); __free_pages(page, 0); } /* * Free pages gathered in the freelist of iommu_iotlb_gather. Use RCU free * way so that it's safe for lock-free page table walk. */ void iommu_free_iotlb_gather_pages(struct iommu_iotlb_gather *iotlb_gather) { struct page *page, *next; list_for_each_entry_safe(page, next, &iotlb_gather->freelist, lru) { list_del(&page->lru); call_rcu(&page->rcu_head, pgtble_page_free_rcu); } } Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [RFC PATCHES 1/2] iommu: Add RCU-protected page free support
On 2022/6/9 20:49, Jason Gunthorpe wrote: +void iommu_free_pgtbl_pages(struct iommu_domain *domain, + struct list_head *pages) +{ + struct page *page, *next; + + if (!domain->concurrent_traversal) { + put_pages_list(pages); + return; + } + + list_for_each_entry_safe(page, next, pages, lru) { + list_del(&page->lru); + call_rcu(&page->rcu_head, pgtble_page_free_rcu); + } It seems OK, but I wonder if there is benefit to using put_pages_list() from the rcu callback The price is that we need to allocate a "struct list_head" and free it in the rcu callback as well. Currently the list_head is sitting in the stack. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 0/5] Simplify vfio_iommu_type1 attach/detach routine
On 2022/6/7 19:58, Jason Gunthorpe wrote: On Tue, Jun 07, 2022 at 03:44:43PM +0800, Baolu Lu wrote: On 2022/6/6 14:19, Nicolin Chen wrote: Worths mentioning the exact match for enforce_cache_coherency is removed with this series, since there's very less value in doing that since KVM won't be able to take advantage of it -- this just wastes domain memory. Instead, we rely on Intel IOMMU driver taking care of that internally. After reading this series, I don't see that Intel IOMMU driver needs any further change to support the new scheme. Did I miss anything? You already did it :) Just as I thought. Thank you! Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 0/5] Simplify vfio_iommu_type1 attach/detach routine
On 2022/6/6 14:19, Nicolin Chen wrote: Worths mentioning the exact match for enforce_cache_coherency is removed with this series, since there's very less value in doing that since KVM won't be able to take advantage of it -- this just wastes domain memory. Instead, we rely on Intel IOMMU driver taking care of that internally. After reading this series, I don't see that Intel IOMMU driver needs any further change to support the new scheme. Did I miss anything? Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/5] iommu: Return -EMEDIUMTYPE for incompatible domain and device/group
On 2022/6/6 14:19, Nicolin Chen wrote: +/** + * iommu_attach_group - Attach an IOMMU group to an IOMMU domain + * @domain: IOMMU domain to attach + * @dev: IOMMU group that will be attached Nit: @group: ... + * + * Returns 0 on success and error code on failure + * + * Specifically, -EMEDIUMTYPE is returned if the domain and the group are + * incompatible in some way. This indicates that a caller should try another + * existing IOMMU domain or allocate a new one. + */ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { int ret; Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 09/12] iommu/vt-d: Check device list of domain in domain free path
On 2022/6/2 14:29, Tian, Kevin wrote: From: Baolu Lu Sent: Wednesday, June 1, 2022 7:02 PM On 2022/6/1 17:28, Tian, Kevin wrote: From: Lu Baolu Sent: Friday, May 27, 2022 2:30 PM When the IOMMU domain is about to be freed, it should not be set on any device. Instead of silently dealing with some bug cases, it's better to trigger a warning to report and fix any potential bugs at the first time. static void domain_exit(struct dmar_domain *domain) { - - /* Remove associated devices and clear attached or cached domains */ - domain_remove_dev_info(domain); + if (WARN_ON(!list_empty(>devices))) + return; warning is good but it doesn't mean the driver shouldn't deal with that situation to make it safer e.g. blocking DMA from all attached device... I have ever thought the same thing. :-) Blocking DMA from attached device should be done when setting blocking domain to the device. It should not be part of freeing a domain. yes but here we are talking about some bug scenario. Here, the caller asks the driver to free the domain, but the driver finds that something is wrong. Therefore, it warns and returns directly. The domain will still be there in use until the next set_domain(). at least it'd look safer if we always try to unmap the entire domain i.e.: static void domain_exit(struct dmar_domain *domain) { - - /* Remove associated devices and clear attached or cached domains */ - domain_remove_dev_info(domain); if (domain->pgd) { LIST_HEAD(freelist); domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), ); put_pages_list(); } + if (WARN_ON(!list_empty(>devices))) + return; kfree(domain); } Fair enough. Removing all mappings is safer. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 09/12] iommu/vt-d: Check device list of domain in domain free path
On 2022/6/1 17:28, Tian, Kevin wrote: From: Lu Baolu Sent: Friday, May 27, 2022 2:30 PM When the IOMMU domain is about to be freed, it should not be set on any device. Instead of silently dealing with some bug cases, it's better to trigger a warning to report and fix any potential bugs at the first time. static void domain_exit(struct dmar_domain *domain) { - - /* Remove associated devices and clear attached or cached domains */ - domain_remove_dev_info(domain); + if (WARN_ON(!list_empty(&domain->devices))) + return; warning is good but it doesn't mean the driver shouldn't deal with that situation to make it safer e.g. blocking DMA from all attached device... I have ever thought the same thing. :-) Blocking DMA from attached device should be done when setting blocking domain to the device. It should not be part of freeing a domain. Here, the caller asks the driver to free the domain, but the driver finds that something is wrong. Therefore, it warns and returns directly. The domain will still be there in use until the next set_domain(). Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 07/12] iommu/vt-d: Acquiring lock in pasid manipulation helpers
On 2022/6/1 17:18, Tian, Kevin wrote: From: Lu Baolu Sent: Friday, May 27, 2022 2:30 PM The iommu->lock is used to protect the per-IOMMU pasid directory table and pasid table. Move the spinlock acquisition/release into the helpers to make the code self-contained. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian , with one nit - /* Caller must ensure PASID entry is not in use. */ - if (pasid_pte_is_present(pte)) - return -EBUSY; + spin_lock(&iommu->lock); + pte = get_non_present_pasid_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } I don't think above is a good abstraction and it changes the error code for a present entry from -EBUSY to -ENODEV. Sure. I will roll it back to -EBUSY. I added this helper because the same code appears at least three times in this file. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 06/12] iommu/vt-d: Acquiring lock in domain ID allocation helpers
Hi Kevin, Thank you for the comments. On 2022/6/1 17:09, Tian, Kevin wrote: From: Lu Baolu Sent: Friday, May 27, 2022 2:30 PM The iommu->lock is used to protect the per-IOMMU domain ID resource. Move the spinlock acquisition/release into the helpers where domain IDs are allocated and freed. The device_domain_lock is irrelevant to domain ID resources, remove its assertion as well. while moving the lock you also replace spin_lock_irqsave() with spin_lock(). It'd be cleaner to just do movement here and then replace all _irqsave() in patch 8. Yeah, that will be clearer. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v4 1/6] iommu: Add a per domain PASID for DMA API
On 2022/6/1 09:43, Tian, Kevin wrote: From: Jacob Pan Sent: Wednesday, June 1, 2022 1:30 AM In both cases the pasid is stored in the attach data instead of the domain. So during IOTLB flush for the domain, do we loop through the attach data? Yes and it's required. What does the attach data mean here? Do you mean group->pasid_array? Why not tracking the {device, pasid} info in the iommu driver when setting domain to {device, pasid}? We have tracked device in a list when setting a domain to device. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 01/12] iommu/vt-d: Use iommu_get_domain_for_dev() in debugfs
On 2022/6/1 02:51, Jason Gunthorpe wrote: Oh, I've spent the last couple of weeks hacking up horrible things manipulating entries in init_mm, and never realised that that was actually the special case. Oh well, live and learn. The init_mm is sort of different, it doesn't have zap in quite the same way, for example. I was talking about the typical process mm. Anyhow, the right solution is to use RCU as I described before, Baolu do you want to try? Yes, of course. Your discussion with Robin gave me a lot of inspiration. Very appreciated! I want to use a separate patch to solve this debugfs problem, because it has exceeded the original intention of this series. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 01/12] iommu/vt-d: Use iommu_get_domain_for_dev() in debugfs
On 2022/5/31 23:59, Jason Gunthorpe wrote: On Tue, May 31, 2022 at 02:52:28PM +0100, Robin Murphy wrote: + break; + pgtable_walk_level(m, phys_to_virt(phys_addr), Also, obligatory reminder that pfn_valid() only means that pfn_to_page() gets you a valid struct page. Whether that page is direct-mapped kernel memory or not is a different matter. Even though this is debugfs, if the operation is sketchy like that and can theortically crash the kernel the driver should test capabilities, CAP_SYS_RAWIO or something may be appropriate. I don't think we have a better cap for 'userspace may crash the kernel' Yes. We should test both CAP_SYS_ADMIN and CAP_SYS_RAWIO. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 01/12] iommu/vt-d: Use iommu_get_domain_for_dev() in debugfs
Hi Robin, Thank you for the comments. On 2022/5/31 21:52, Robin Murphy wrote: On 2022-05-31 04:02, Baolu Lu wrote: On 2022/5/30 20:14, Jason Gunthorpe wrote: On Sun, May 29, 2022 at 01:14:46PM +0800, Baolu Lu wrote: [--snip--] diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index d927ef10641b..e6f4835b8d9f 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -333,25 +333,28 @@ static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, continue; path[level] = pde->val; - if (dma_pte_superpage(pde) || level == 1) + if (dma_pte_superpage(pde) || level == 1) { dump_page_info(m, start, path); - else - pgtable_walk_level(m, phys_to_virt(dma_pte_addr(pde)), + } else { + unsigned long phys_addr; + + phys_addr = (unsigned long)dma_pte_addr(pde); + if (!pfn_valid(__phys_to_pfn(phys_addr))) Given that pte_present(pde) passed just above, it was almost certainly a valid entry, so it seems unlikely that the physical address it pointed to could have disappeared in the meantime. If you're worried about the potential case where we've been preempted during this walk for long enough that the page has already been freed by an unmap, reallocated, Yes. This is exactly what I am worried about and what this patch wants to solve. and filled with someone else's data that happens to look like valid PTEs, this still isn't enough, since that data could just as well happen to look like valid physical addresses too. I imagine that if you want to safely walk pagetables concurrently with them potentially being freed, you'd probably need to get RCU involved. I don't want to make the map/unmap interface more complex or inefficient because of a debugfs feature. I hope that the debugfs and map/unmap interfaces are orthogonal, just like the IOMMU hardware traversing the page tables, as long as the accessed physical address is valid and accessible. Otherwise, stop the traversal immediately. 
If we can't achieve this, I'd rather stop supporting this debugfs node. + break; + pgtable_walk_level(m, phys_to_virt(phys_addr), Also, obligatory reminder that pfn_valid() only means that pfn_to_page() gets you a valid struct page. Whether that page is direct-mapped kernel memory or not is a different matter. Perhaps I can check this from the page flags? level - 1, start, path); + } path[level] = 0; } } -static int show_device_domain_translation(struct device *dev, void *data) +static int __show_device_domain_translation(struct device *dev, void *data) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *domain = info->domain; struct seq_file *m = data; u64 path[6] = { 0 }; - if (!domain) - return 0; - seq_printf(m, "Device %s @0x%llx\n", dev_name(dev), (u64)virt_to_phys(domain->pgd)); seq_puts(m, "IOVA_PFN\t\tPML5E\t\t\tPML4E\t\t\tPDPE\t\t\tPDE\t\t\tPTE\n"); @@ -359,20 +362,27 @@ static int show_device_domain_translation(struct device *dev, void *data) pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path); seq_putc(m, '\n'); - return 0; + return 1; } -static int domain_translation_struct_show(struct seq_file *m, void *unused) +static int show_device_domain_translation(struct device *dev, void *data) { - unsigned long flags; - int ret; + struct iommu_group *group; - spin_lock_irqsave(_domain_lock, flags); - ret = bus_for_each_dev(_bus_type, NULL, m, - show_device_domain_translation); - spin_unlock_irqrestore(_domain_lock, flags); + group = iommu_group_get(dev); + if (group) { + iommu_group_for_each_dev(group, data, + __show_device_domain_translation); Why group_for_each_dev? This will hold the group mutex when the callback is invoked. With the group mutex hold, the domain could not get changed. If there *are* multiple devices in the group then by definition they should be attached to the same domain, so dumping that domain's mappings more than once seems pointless. 
Especially given that the outer bus_for_each_dev iteration will already visit each individual device anyway, so this would only make the redundancy even worse than it already is. __show_device_domain_translation() only dumps mappings once as it always returns 1. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 01/12] iommu/vt-d: Use iommu_get_domain_for_dev() in debugfs
On 2022/5/31 21:10, Jason Gunthorpe wrote: On Tue, May 31, 2022 at 11:02:06AM +0800, Baolu Lu wrote: For case 2, it is a bit weird. I tried to add a rwsem lock to make the iommu_unmap() and dumping tables in debugfs exclusive. This does not work because debugfs may depend on the DMA of the devices to work. It seems that what we can do is to allow this race, but when we traverse the page table in debugfs, we will check the validity of the physical address retrieved from the page table entry. Then, the worst case is to print some useless information. Sounds horrible, don't you have locking around the IOPTEs of some kind? How does updating them work reliably? There's no locking around updating the IOPTEs. The basic assumption is that at any time, there's only a single thread manipulating the mappings of the range specified in iommu_map/unmap() APIs. Therefore, the race only exists when multiple ranges share some high-level IOPTEs. The IOMMU driver updates those IOPTEs using the compare-and-exchange atomic operation. It is just debugfs, so maybe it is not the end of the world, but still.. Fair enough. I think this is somewhat similar to that IOMMU hardware can traverse the page table at any time without considering when the CPUs update it. The IOMMU hardware will generate faults when it encounters failure during the traverse of page table. Similarly, perhaps debugfs could dump all-ones for an invalid IOPTE? Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v4 1/6] iommu: Add a per domain PASID for DMA API
On 2022/5/31 18:12, Tian, Kevin wrote: +++ b/include/linux/iommu.h @@ -105,6 +105,8 @@ struct iommu_domain { enum iommu_page_response_code (*iopf_handler)(struct iommu_fault *fault, void *data); void *fault_data; + ioasid_t pasid; /* Used for DMA requests with PASID */ + atomic_t pasid_users; These are poorly named, this is really the DMA API global PASID and shouldn't be used for other things. Perhaps I misunderstood, do you mind explaining more? You still haven't really explained what this is for in this patch, maybe it just needs a better commit message, or maybe something is wrong. I keep saying the DMA API usage is not special, so why do we need to create a new global pasid and refcount? Realistically this is only going to be used by IDXD, why can't we just allocate a PASID and return it to the driver every time a driver asks for DMA API on PASI mode? Why does the core need to do anything special? Agree. I guess it was a mistake caused by treating ENQCMD as the only user although the actual semantics of the invented interfaces have already evolved to be quite general. This is very similar to what we have been discussing for iommufd. a PASID is just an additional routing info when attaching a device to an I/O address space (DMA API in this context) and by default it should be a per-device resource except when ENQCMD is explicitly opt in. Hence it's right time for us to develop common facility working for both this DMA API usage and iommufd, i.e.: for normal PASID attach to a domain, driver: allocates a local pasid from device local space; attaches the local pasid to a domain; for PASID attach in particular for ENQCMD, driver: allocates a global pasid in system-wide; attaches the global pasid to a domain; set the global pasid in PASID_MSR; In both cases the pasid is stored in the attach data instead of the domain. 
DMA API pasid is no special from above except it needs to allow one device attached to the same domain twice (one with RID and the other with RID+PASID). for iommufd those operations are initiated by userspace via iommufd uAPI. My understanding is that device driver owns its PASID policy. If ENQCMD is supported on the device, the PASIDs should be allocated through ioasid_alloc(). Otherwise, the whole PASID pool is managed by the device driver. For kernel DMA w/ PASID, after the driver has a PASID for this purpose, it can just set the default domain to the PASID on device. There's no need for enable/disable() interfaces. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH RFC 09/19] iommu/amd: Access/Dirty bit support in IOPTEs
Hi Suravee, On 2022/5/31 19:34, Suravee Suthikulpanit wrote: On 4/29/22 4:09 AM, Joao Martins wrote: . +static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable) +{ + struct protection_domain *pdomain = to_pdomain(domain); + struct iommu_dev_data *dev_data; + bool dom_flush = false; + + if (!amd_iommu_had_support) + return -EOPNOTSUPP; + + list_for_each_entry(dev_data, &pdomain->dev_list, list) { Since we iterate through device list for the domain, we would need to call spin_lock_irqsave(&pdomain->lock, flags) here. Not related, just out of curiosity. Does it really need to disable the interrupt while holding this lock? Any case this list would be traversed in any interrupt context? Perhaps I missed anything? Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 01/12] iommu/vt-d: Use iommu_get_domain_for_dev() in debugfs
On 2022/5/30 20:14, Jason Gunthorpe wrote: On Sun, May 29, 2022 at 01:14:46PM +0800, Baolu Lu wrote: From 1e87b5df40c6ce9414cdd03988c3b52bfb17af5f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sun, 29 May 2022 10:18:56 +0800 Subject: [PATCH 1/1] iommu/vt-d: debugfs: Remove device_domain_lock usage The domain_translation_struct debugfs node is used to dump static mappings of PCI devices. It potentially races with setting new domains to devices and the iommu_map/unmap() interfaces. The existing code tries to use the global spinlock device_domain_lock to avoid the races, but this is problematical as this lock is only used to protect the device tracking lists of the domains. Instead of using an immature lock to cover up the problem, it's better to explicitly restrict the use of this debugfs node. This also makes device_domain_lock static. What does "explicitly restrict" mean? I originally thought about adding restrictions on this interface to a document. But obviously, this is a naive idea. :-) I went over the code again. The races exist in two paths: 1. Dump the page table in use while setting a new page table to the device. 2. A high-level page table entry has been marked as non-present, but the dumping code has walked down to the low-level tables. For case 1, we can try to solve it by dumping tables while holding the group->mutex. For case 2, it is a bit weird. I tried to add a rwsem lock to make the iommu_unmap() and dumping tables in debugfs exclusive. This does not work because debugfs may depend on the DMA of the devices to work. It seems that what we can do is to allow this race, but when we traverse the page table in debugfs, we will check the validity of the physical address retrieved from the page table entry. Then, the worst case is to print some useless information. 
The real code looks like this: From 3feb0727f9d7095729ef75ab1967270045b3a38c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sun, 29 May 2022 10:18:56 +0800 Subject: [PATCH 1/1] iommu/vt-d: debugfs: Remove device_domain_lock usage The domain_translation_struct debugfs node is used to dump the DMAR page tables for the PCI devices. It potentially races with setting domains to devices and the iommu_unmap() interface. The existing code uses a global spinlock device_domain_lock to avoid the races, but this is problematical as this lock is only used to protect the device tracking lists of each domain. This replaces device_domain_lock with group->mutex to protect the traverse of the page tables from setting a new domain and always check the physical address retrieved from the page table entry before traversing to the next- level page table. As a cleanup, this also makes device_domain_lock static. Signed-off-by: Lu Baolu --- drivers/iommu/intel/debugfs.c | 42 ++- drivers/iommu/intel/iommu.c | 2 +- drivers/iommu/intel/iommu.h | 1 - 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index d927ef10641b..e6f4835b8d9f 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -333,25 +333,28 @@ static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, continue; path[level] = pde->val; - if (dma_pte_superpage(pde) || level == 1) + if (dma_pte_superpage(pde) || level == 1) { dump_page_info(m, start, path); - else - pgtable_walk_level(m, phys_to_virt(dma_pte_addr(pde)), + } else { + unsigned long phys_addr; + + phys_addr = (unsigned long)dma_pte_addr(pde); + if (!pfn_valid(__phys_to_pfn(phys_addr))) + break; + pgtable_walk_level(m, phys_to_virt(phys_addr), level - 1, start, path); + } path[level] = 0; } } -static int show_device_domain_translation(struct device *dev, void *data) +static int __show_device_domain_translation(struct device *dev, void *data) { struct 
device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *domain = info->domain; struct seq_file *m = data; u64 path[6] = { 0 }; - if (!domain) - return 0; - seq_printf(m, "Device %s @0x%llx\n", dev_name(dev), (u64)virt_to_phys(domain->pgd)); seq_puts(m, "IOVA_PFN\t\tPML5E\t\t\tPML4E\t\t\tPDPE\t\t\tPDE\t\t\tPTE\n"); @@ -359,20 +362,27 @@ static int show_device_domain_translation(struct device *dev, void *data) pgtable_walk_level(m, domain->pgd, domain->agaw + 2, 0, path); seq_putc(m, '\n'); - return 0; + return 1; } -static int domain_translation_struct_show(struct seq_file *m, void *unused
Re: [PATCH 03/12] iommu/vt-d: Remove clearing translation data in disable_dmar_iommu()
On 2022/5/27 23:01, Jason Gunthorpe wrote: On Fri, May 27, 2022 at 02:30:10PM +0800, Lu Baolu wrote: The disable_dmar_iommu() is called when IOMMU initialzation fails or the IOMMU is hot-removed from the system. In both cases, there is no need to clear the IOMMU translation data structures for devices. On the initialization path, the device probing only happens after the IOMMU is initialized successfully, hence there're no translation data structures. On the hot-remove path, there is no real use case where the IOMMU is hot-removed, but the devices that it manages are still alive in the system. The translation data structures were torn down during device release, hence there's no need to repeat it in IOMMU hot-remove path either. Can you leave behind a 1 statement WARN_ON of some kind to check this? Sure. As the default domain is the first domain allocated for a device and the last one freed. We can WARN_ON the case where there's still domain IDs in use. How do you like this? + /* +* All iommu domains must have been detached from the devices, +* hence there should be no domain IDs in use. +*/ + if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap)) + != NUM_RESERVED_DID)) + return; Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 01/12] iommu/vt-d: Use iommu_get_domain_for_dev() in debugfs
On 2022/5/27 22:59, Jason Gunthorpe wrote: On Fri, May 27, 2022 at 02:30:08PM +0800, Lu Baolu wrote: Retrieve the attached domain for a device through the generic interface exposed by the iommu core. This also makes device_domain_lock static. Signed-off-by: Lu Baolu drivers/iommu/intel/iommu.h | 1 - drivers/iommu/intel/debugfs.c | 20 drivers/iommu/intel/iommu.c | 2 +- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index a22adfbdf870..8a6d64d726c0 100644 +++ b/drivers/iommu/intel/iommu.h @@ -480,7 +480,6 @@ enum { #define VTD_FLAG_SVM_CAPABLE (1 << 2) extern int intel_iommu_sm; -extern spinlock_t device_domain_lock; #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) #define pasid_supported(iommu)(sm_supported(iommu) && \ diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index d927ef10641b..eea8727aa7bc 100644 +++ b/drivers/iommu/intel/debugfs.c @@ -344,19 +344,21 @@ static void pgtable_walk_level(struct seq_file *m, struct dma_pte *pde, static int show_device_domain_translation(struct device *dev, void *data) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *domain = info->domain; + struct dmar_domain *dmar_domain; + struct iommu_domain *domain; struct seq_file *m = data; u64 path[6] = { 0 }; + domain = iommu_get_domain_for_dev(dev); if (!domain) return 0; The iommu_get_domain_for_dev() API should be called something like 'iommu_get_dma_api_domain()' and clearly documented that it is safe to call only so long as a DMA API using driver is attached to the device, which is most of the current callers. Yes, agreed. This use in random sysfs inside the iommu driver is not OK because it doesn't have any locking protecting domain from concurrent free. This is not sysfs, but debugfs. The description of this patch is confusing. I should make it specific and straight-forward. How about below one? 
From 1e87b5df40c6ce9414cdd03988c3b52bfb17af5f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sun, 29 May 2022 10:18:56 +0800 Subject: [PATCH 1/1] iommu/vt-d: debugfs: Remove device_domain_lock usage The domain_translation_struct debugfs node is used to dump static mappings of PCI devices. It potentially races with setting new domains to devices and the iommu_map/unmap() interfaces. The existing code tries to use the global spinlock device_domain_lock to avoid the races, but this is problematical as this lock is only used to protect the device tracking lists of the domains. Instead of using an immature lock to cover up the problem, it's better to explicitly restrict the use of this debugfs node. This also makes device_domain_lock static. Signed-off-by: Lu Baolu --- drivers/iommu/intel/debugfs.c | 17 - drivers/iommu/intel/iommu.c | 2 +- drivers/iommu/intel/iommu.h | 1 - 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index d927ef10641b..9642e3e9d6b0 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -362,17 +362,16 @@ static int show_device_domain_translation(struct device *dev, void *data) return 0; } +/* + * Dump the static mappings of PCI devices. This is only for DEBUGFS code, + * don't use it for other purposes. It potentially races with setting new + * domains to devices and iommu_map/unmap(). Use the trace events under + * /sys/kernel/debug/tracing/events/iommu/ for dynamic debugging. 
+ */ static int domain_translation_struct_show(struct seq_file *m, void *unused) { - unsigned long flags; - int ret; - - spin_lock_irqsave(_domain_lock, flags); - ret = bus_for_each_dev(_bus_type, NULL, m, - show_device_domain_translation); - spin_unlock_irqrestore(_domain_lock, flags); - - return ret; + return bus_for_each_dev(_bus_type, NULL, m, + show_device_domain_translation); } DEFINE_SHOW_ATTRIBUTE(domain_translation_struct); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 1af4b6562266..cacae8bdaa65 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -314,7 +314,7 @@ static int iommu_skip_te_disable; #define IDENTMAP_GFX 2 #define IDENTMAP_AZALIA4 -DEFINE_SPINLOCK(device_domain_lock); +static DEFINE_SPINLOCK(device_domain_lock); static LIST_HEAD(device_domain_list); /* diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index a22adfbdf870..8a6d64d726c0 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -480,7 +480,6 @@ enum { #define VTD_FLAG_SVM_CAPABLE (1 << 2) extern int intel_iommu_sm; -extern
Re: [PATCH v7 03/10] iommu/sva: Add iommu_sva_domain support
On 2022/5/25 23:25, Jason Gunthorpe wrote: On Wed, May 25, 2022 at 01:19:08PM +0800, Baolu Lu wrote: Hi Jason, On 2022/5/24 21:44, Jason Gunthorpe wrote: diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c index 106506143896..210c376f6043 100644 +++ b/drivers/iommu/iommu-sva-lib.c @@ -69,3 +69,51 @@ struct mm_struct *iommu_sva_find(ioasid_t pasid) return ioasid_find(&iommu_sva_pasid, pasid, __mmget_not_zero); } EXPORT_SYMBOL_GPL(iommu_sva_find); + +/* + * IOMMU SVA driver-oriented interfaces + */ +struct iommu_domain * +iommu_sva_alloc_domain(struct bus_type *bus, struct mm_struct *mm) This should return the proper type Can you please elaborate a bit on "return the proper type"? Did you mean return iommu_sva_domain instead? That's a wrapper of iommu_domain and only for iommu internal usage. If you are exposing special SVA APIs then it is not internal usage only anymore, so expose the type. Ah, got you. Thanks for the explanation. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v7 03/10] iommu/sva: Add iommu_sva_domain support
On 2022/5/25 19:06, Jean-Philippe Brucker wrote: On Wed, May 25, 2022 at 11:07:49AM +0100, Robin Murphy wrote: Did you mean @handler and @handler_token staffs below? struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ iommu_fault_handler_t handler; void *handler_token; struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; }; Is it only for DMA domains? From the point view of IOMMU faults, it seems to be generic. Yes, it's the old common iommu_set_fault_handler() stuff (which arguably is more of a "notifier" than a "handler"), but I assume that that's irrelevant if SVA is using IOPF instead? Yes IOMMU drivers call either the newer iommu_report_device_fault() or the old report_iommu_fault(), and only the former can support IOPF/SVA. I've tried to merge them before but never completed it. I think the main issue was with finding the endpoint that caused the fault from the fault handler. Some IOMMU drivers just pass the IOMMU device to report_iommu_fault(). I'll probably pick that up at some point. Thank you all for the comments and suggestions. Below is the refreshed patch. Hope that I didn't miss anything. From 463c04cada8e8640598f981d8d16157781b9de6f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 11 May 2022 20:59:24 +0800 Subject: [PATCH 04/11] iommu: Add sva iommu_domain support The sva iommu_domain represents a hardware pagetable that the IOMMU hardware could use for SVA translation. This adds some infrastructure to support SVA domain in the iommu common layer. It includes: - Extend the iommu_domain to support a new IOMMU_DOMAIN_SVA domain type. The IOMMU drivers that support SVA should provide the sva domain specific iommu_domain_ops. - Add a helper to allocate an SVA domain. The iommu_domain_free() is still used to free an SVA domain. - Add helpers to attach an SVA domain to a device and the reverse operation. 
Some buses, like PCI, route packets without considering the PASID value. Thus a DMA target address with PASID might be treated as P2P if the address falls into the MMIO BAR of other devices in the group. To make things simple, the attach/detach interfaces only apply to devices belonging to the singleton groups, and the singleton is immutable in fabric i.e. not affected by hotplug. The iommu_attach/detach_device_pasid() can be used for other purposes, such as kernel DMA with pasid, mediation device, etc. Suggested-by: Jean-Philippe Brucker Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker --- drivers/iommu/iommu.c | 93 +++ include/linux/iommu.h | 45 - 2 files changed, 136 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 63b64b4e8a38..b1a2ad64a413 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -27,6 +27,7 @@ #include #include #include +#include static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); @@ -39,6 +40,7 @@ struct iommu_group { struct kobject kobj; struct kobject *devices_kobj; struct list_head devices; + struct xarray pasid_array; struct mutex mutex; void *iommu_data; void (*iommu_data_release)(void *iommu_data); @@ -666,6 +668,7 @@ struct iommu_group *iommu_group_alloc(void) mutex_init(&group->mutex); INIT_LIST_HEAD(&group->devices); INIT_LIST_HEAD(&group->entry); + xa_init(&group->pasid_array); ret = ida_simple_get(&iommu_group_ida, 0, 0, GFP_KERNEL); if (ret < 0) { @@ -1961,6 +1964,8 @@ EXPORT_SYMBOL_GPL(iommu_domain_alloc); void iommu_domain_free(struct iommu_domain *domain) { + if (domain->type == IOMMU_DOMAIN_SVA) + mmdrop(domain->mm); iommu_put_dma_cookie(domain); domain->ops->free(domain); } @@ -3277,3 +3282,91 @@ bool iommu_group_dma_owner_claimed(struct iommu_group *group) return user; } EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed); + +struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm) +{ + const struct iommu_ops *ops = 
dev_iommu_ops(dev); + struct iommu_domain *domain; + + domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); + if (!domain) + return NULL; + + domain->type = IOMMU_DOMAIN_SVA; + mmgrab(mm); + domain->mm = mm; + + return domain; +} + +static bool iommu_group_immutable_singleton(struct iommu_group *group, + struct device *dev) +{ + int count; + + mutex_lock(&group->mutex); + count = iommu_group_device_count(group); + mutex_unlock(&group->mutex); + + if (count != 1) + return false; + + /* +* The PCI device could be considered to
Re: [PATCH 1/3] iommu: Add Visconti5 IOMMU driver
On 2022/5/25 09:31, Nobuhiro Iwamatsu wrote: +static const struct iommu_ops visconti_atu_ops = { + .domain_alloc = visconti_atu_domain_alloc, + .probe_device = visconti_atu_probe_device, + .release_device = visconti_atu_release_device, + .device_group = generic_device_group, + .of_xlate = visconti_atu_of_xlate, + .pgsize_bitmap = ATU_IOMMU_PGSIZE_BITMAP, + .default_domain_ops = &(const struct iommu_domain_ops) { + .attach_dev = visconti_atu_attach_device, + .detach_dev = visconti_atu_detach_device, The detach_dev callback is about to be deprecated. The new drivers should implement the default domain and blocking domain instead. + .map = visconti_atu_map, + .unmap = visconti_atu_unmap, + .iova_to_phys = visconti_atu_iova_to_phys, + .free = visconti_atu_domain_free, + } +}; Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v7 03/10] iommu/sva: Add iommu_sva_domain support
Hi Robin, On 2022/5/24 22:36, Robin Murphy wrote: On 2022-05-19 08:20, Lu Baolu wrote: [...] diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c index 106506143896..210c376f6043 100644 --- a/drivers/iommu/iommu-sva-lib.c +++ b/drivers/iommu/iommu-sva-lib.c @@ -69,3 +69,51 @@ struct mm_struct *iommu_sva_find(ioasid_t pasid) return ioasid_find(&iommu_sva_pasid, pasid, __mmget_not_zero); } EXPORT_SYMBOL_GPL(iommu_sva_find); + +/* + * IOMMU SVA driver-oriented interfaces + */ +struct iommu_domain * +iommu_sva_alloc_domain(struct bus_type *bus, struct mm_struct *mm) Argh, please no new bus-based external interfaces! Domain allocation needs to resolve to the right IOMMU instance to solve a number of issues, and cleaning up existing users of iommu_domain_alloc() to prepare for that is already hard enough. This is arguably even more relevant here than for other domain types, since SVA support is more likely to depend on specific features that can vary between IOMMU instances even with the same driver. Please make the external interface take a struct device, then resolve the ops through dev->iommu. Further nit: the naming inconsistency bugs me a bit - iommu_sva_domain_alloc() seems more natural. Also I'd question the symmetry vs. 
usability dichotomy of whether we *really* want two different free functions for a struct iommu_domain pointer, where any caller which might mix SVA and non-SVA usage then has to remember how they allocated any particular domain :/ +{ + struct iommu_sva_domain *sva_domain; + struct iommu_domain *domain; + + if (!bus->iommu_ops || !bus->iommu_ops->sva_domain_ops) + return ERR_PTR(-ENODEV); + + sva_domain = kzalloc(sizeof(*sva_domain), GFP_KERNEL); + if (!sva_domain) + return ERR_PTR(-ENOMEM); + + mmgrab(mm); + sva_domain->mm = mm; + + domain = &sva_domain->domain; + domain->type = IOMMU_DOMAIN_SVA; + domain->ops = bus->iommu_ops->sva_domain_ops; I'd have thought it would be logical to pass IOMMU_DOMAIN_SVA to the normal domain_alloc call, so that driver-internal stuff like context descriptors can be still be hung off the domain as usual (rather than all drivers having to implement some extra internal lookup mechanism to handle all the SVA domain ops), but that's something we're free to come Agreed with above comments. Thanks! I will post an additional patch for review later. back and change later. FWIW I'd just stick the mm pointer in struct iommu_domain, in a union with the fault handler stuff and/or iova_cookie - those are mutually exclusive with SVA, right? "iova_cookie" is mutually exclusive with SVA, but I am not sure about the fault handler stuff. Did you mean @handler and @handler_token stuff below? struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; unsigned long pgsize_bitmap;/* Bitmap of page sizes in use */ iommu_fault_handler_t handler; void *handler_token; struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; }; Is it only for DMA domains? From the point of view of IOMMU faults, it seems to be generic. Best regards, baolu ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu