Add a para-virtualized IOMMU driver for Linux guests running on Hyper-V. This driver implements stage-1 IO translation within the guest OS. It integrates with the Linux IOMMU core, utilizing Hyper-V hypercalls for:
 - Capability discovery
 - Domain allocation, configuration, and deallocation
 - Device attachment and detachment
 - IOTLB invalidation
The driver constructs x86-compatible stage-1 IO page tables in the guest memory using consolidated IO page table helpers. This allows the guest to manage stage-1 translations independently of vendor-specific drivers (like Intel VT-d or AMD IOMMU). Hyper-V consumes this stage-1 IO page table when a device domain is created and configured, and nests it with the host's stage-2 IO page tables, thereby eliminating the VM exits for guest IOMMU mapping operations. For unmapping operations, VM exits to perform the IOTLB flush are still unavoidable. To identify a device in its hypercall interface, the driver looks up the logical device ID prefix registered for the device's PCI domain (see the logical device ID registry in hv_common.c) and combines it with the PCI function number of the endpoint device. Co-developed-by: Wei Liu <[email protected]> Signed-off-by: Wei Liu <[email protected]> Co-developed-by: Easwar Hariharan <[email protected]> Signed-off-by: Easwar Hariharan <[email protected]> Signed-off-by: Yu Zhang <[email protected]> --- arch/x86/hyperv/hv_init.c | 4 + arch/x86/include/asm/mshyperv.h | 4 + drivers/iommu/Kconfig | 1 + drivers/iommu/hyperv/Kconfig | 16 + drivers/iommu/hyperv/Makefile | 1 + drivers/iommu/hyperv/iommu.c | 620 ++++++++++++++++++++++++++++++++ drivers/iommu/hyperv/iommu.h | 51 +++ 7 files changed, 697 insertions(+) create mode 100644 drivers/iommu/hyperv/Kconfig create mode 100644 drivers/iommu/hyperv/iommu.c create mode 100644 drivers/iommu/hyperv/iommu.h diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 55a8b6de2865..094f9f7ddb72 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -578,6 +578,10 @@ void __init hyperv_init(void) old_setup_percpu_clockev = x86_init.timers.setup_percpu_clockev; x86_init.timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev; +#ifdef CONFIG_HYPERV_PVIOMMU + x86_init.iommu.iommu_init = hv_iommu_init; +#endif + hv_apic_init(); x86_init.pci.arch_init = 
hv_pci_init; diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index f64393e853ee..20d947c2c758 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -313,6 +313,10 @@ static inline void mshv_vtl_return_hypercall(void) {} static inline void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {} #endif +#ifdef CONFIG_HYPERV_PVIOMMU +int __init hv_iommu_init(void); +#endif + #include <asm-generic/mshyperv.h> #endif diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 6e07bd69467a..0d128f377929 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -195,6 +195,7 @@ config MSM_IOMMU source "drivers/iommu/amd/Kconfig" source "drivers/iommu/arm/Kconfig" source "drivers/iommu/intel/Kconfig" +source "drivers/iommu/hyperv/Kconfig" source "drivers/iommu/iommufd/Kconfig" source "drivers/iommu/riscv/Kconfig" diff --git a/drivers/iommu/hyperv/Kconfig b/drivers/iommu/hyperv/Kconfig new file mode 100644 index 000000000000..8b6abbaaf9b8 --- /dev/null +++ b/drivers/iommu/hyperv/Kconfig @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only +# HyperV paravirtualized IOMMU support +config HYPERV_PVIOMMU + bool "Microsoft Hypervisor para-virtualized IOMMU support" + depends on X86_64 && HYPERV + select IOMMU_API + select GENERIC_PT + select IOMMU_PT + select IOMMU_PT_X86_64 + select IOMMU_IOVA + default HYPERV + help + Para-virtualized IOMMU driver for Linux guests running on + Microsoft Hyper-V. Provides DMA remapping and IOTLB + flush support to enable DMA isolation for devices + assigned to the guest. 
diff --git a/drivers/iommu/hyperv/Makefile b/drivers/iommu/hyperv/Makefile index 6ef0ef97f3dd..fefb409d976b 100644 --- a/drivers/iommu/hyperv/Makefile +++ b/drivers/iommu/hyperv/Makefile @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_IRQ_REMAP) += hv-irq-remap-x86.o +obj-$(CONFIG_HYPERV_PVIOMMU) += iommu.o diff --git a/drivers/iommu/hyperv/iommu.c b/drivers/iommu/hyperv/iommu.c new file mode 100644 index 000000000000..254136946404 --- /dev/null +++ b/drivers/iommu/hyperv/iommu.c @@ -0,0 +1,620 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Hyper-V IOMMU driver. + * + * Copyright (C) 2019, 2024-2026 Microsoft, Inc. + */ + +#define pr_fmt(fmt) "Hyper-V pvIOMMU: " fmt +#define dev_fmt(fmt) pr_fmt(fmt) + +#include <linux/iommu.h> +#include <linux/pci.h> +#include <linux/dma-map-ops.h> +#include <linux/generic_pt/iommu.h> +#include <linux/pci-ats.h> + +#include <asm/iommu.h> +#include <asm/hypervisor.h> +#include <asm/mshyperv.h> + +#include "iommu.h" +#include "../iommu-pages.h" + +struct hv_iommu_dev *hv_iommu_device; + +/* + * Identity and blocking domains are static singletons: identity is a 1:1 + * passthrough with no page table, blocking rejects all DMA. Neither holds + * per-IOMMU state, so one instance suffices even with multiple vIOMMUs. 
+ */ +static const struct iommu_domain_ops hv_iommu_identity_domain_ops; +static const struct iommu_domain_ops hv_iommu_blocking_domain_ops; +static struct iommu_ops hv_iommu_ops; + +static struct hv_iommu_domain hv_identity_domain = { + .domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &hv_iommu_identity_domain_ops, + .owner = &hv_iommu_ops, + }, +}; +static struct hv_iommu_domain hv_blocking_domain = { + .domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &hv_iommu_blocking_domain_ops, + .owner = &hv_iommu_ops, + }, +}; + +static inline bool hv_iommu_present(u64 cap) +{ + return cap & HV_IOMMU_CAP_PRESENT; +} + +static inline bool hv_iommu_s1_domain_supported(u64 cap) +{ + return cap & HV_IOMMU_CAP_S1; +} + +static inline bool hv_iommu_5lvl_supported(u64 cap) +{ + return cap & HV_IOMMU_CAP_S1_5LVL; +} + +static inline bool hv_iommu_ats_supported(u64 cap) +{ + return cap & HV_IOMMU_CAP_ATS; +} + +static int hv_create_device_domain(struct hv_iommu_domain *hv_domain, u32 domain_stage) +{ + int ret; + u64 status; + unsigned long flags; + struct hv_input_create_device_domain *input; + + ret = ida_alloc_range(&hv_iommu_device->domain_ids, + hv_iommu_device->first_domain, hv_iommu_device->last_domain, + GFP_KERNEL); + if (ret < 0) + return ret; + + hv_domain->device_domain.partition_id = HV_PARTITION_ID_SELF; + hv_domain->device_domain.domain_id.type = domain_stage; + hv_domain->device_domain.domain_id.id = ret; + hv_domain->hv_iommu = hv_iommu_device; + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->device_domain = hv_domain->device_domain; + input->create_device_domain_flags.forward_progress_required = 1; + input->create_device_domain_flags.inherit_owning_vtl = 0; + status = hv_do_hypercall(HVCALL_CREATE_DEVICE_DOMAIN, input, NULL); + + local_irq_restore(flags); + + if (!hv_result_success(status)) { + pr_err("HVCALL_CREATE_DEVICE_DOMAIN failed, status %lld\n", status); + 
ida_free(&hv_iommu_device->domain_ids, hv_domain->device_domain.domain_id.id); + } + + return hv_result_to_errno(status); +} + +static void hv_delete_device_domain(struct hv_iommu_domain *hv_domain) +{ + u64 status; + unsigned long flags; + struct hv_input_delete_device_domain *input; + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->device_domain = hv_domain->device_domain; + status = hv_do_hypercall(HVCALL_DELETE_DEVICE_DOMAIN, input, NULL); + + local_irq_restore(flags); + + if (!hv_result_success(status)) + pr_err("HVCALL_DELETE_DEVICE_DOMAIN failed, status %lld\n", status); + + ida_free(&hv_domain->hv_iommu->domain_ids, hv_domain->device_domain.domain_id.id); +} + +static bool hv_iommu_capable(struct device *dev, enum iommu_cap cap) +{ + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + return true; + case IOMMU_CAP_DEFERRED_FLUSH: + return true; + default: + return false; + } +} + +static void hv_flush_device_domain(struct hv_iommu_domain *hv_domain) +{ + u64 status; + unsigned long flags; + struct hv_input_flush_device_domain *input; + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->device_domain = hv_domain->device_domain; + status = hv_do_hypercall(HVCALL_FLUSH_DEVICE_DOMAIN, input, NULL); + + local_irq_restore(flags); + + if (!hv_result_success(status)) + pr_err("HVCALL_FLUSH_DEVICE_DOMAIN failed, status %lld\n", status); +} + +static int hv_iommu_attach_dev(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old) +{ + u64 status; + u32 prefix; + unsigned long flags; + struct pci_dev *pdev; + struct hv_input_attach_device_domain *input; + struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev); + struct hv_iommu_domain *hv_domain = to_hv_iommu_domain(domain); + int ret; + + if (vdev->hv_domain == hv_domain) + return 0; + + pdev = to_pci_dev(dev); + dev_dbg(dev, "attaching to domain %d\n", + 
hv_domain->device_domain.domain_id.id); + + ret = hv_iommu_lookup_logical_dev_id(pci_domain_nr(pdev->bus), &prefix); + if (ret) { + dev_err(&pdev->dev, "no IOMMU registration for vPCI bus\n"); + return ret; + } + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->device_domain = hv_domain->device_domain; + input->device_id.as_uint64 = (u64)prefix | PCI_FUNC(pdev->devfn); + status = hv_do_hypercall(HVCALL_ATTACH_DEVICE_DOMAIN, input, NULL); + + local_irq_restore(flags); + + if (!hv_result_success(status)) + pr_err("HVCALL_ATTACH_DEVICE_DOMAIN failed, status %lld\n", status); + else + vdev->hv_domain = hv_domain; + + return hv_result_to_errno(status); +} + +static int hv_iommu_blocking_attach_dev(struct iommu_domain *domain, + struct device *dev, + struct iommu_domain *old) +{ + int ret = hv_iommu_attach_dev(domain, dev, old); + + /* + * Attaching to the blocking domain only asks the hypervisor to + * disable translation and IOPF for the device, so it cannot fail + * unless there is a driver or hypervisor bug. Return the hypercall + * status rather than 0 so that a failure on the DMA ownership claim + * path (VFIO/iommufd) fails the claim instead of leaving the device + * unblocked. WARN since such a failure indicates a bug. 
+ */ + WARN_ON(ret); + return ret; +} + +static int hv_iommu_get_logical_device_property(struct device *dev, + u32 code, + struct hv_output_get_logical_device_property *property) +{ + u64 status; + u32 prefix; + unsigned long flags; + int ret; + struct pci_dev *pdev = to_pci_dev(dev); + struct hv_input_get_logical_device_property *input; + struct hv_output_get_logical_device_property *output; + + ret = hv_iommu_lookup_logical_dev_id(pci_domain_nr(pdev->bus), &prefix); + if (ret) + return ret; + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = (struct hv_output_get_logical_device_property *)(input + 1); + memset(input, 0, sizeof(*input)); + input->partition_id = HV_PARTITION_ID_SELF; + input->logical_device_id = (u64)prefix | PCI_FUNC(pdev->devfn); + input->code = code; + status = hv_do_hypercall(HVCALL_GET_LOGICAL_DEVICE_PROPERTY, input, output); + *property = *output; + + local_irq_restore(flags); + + if (!hv_result_success(status)) + pr_err("HVCALL_GET_LOGICAL_DEVICE_PROPERTY failed, status %lld\n", status); + + return hv_result_to_errno(status); +} + +static struct iommu_device *hv_iommu_probe_device(struct device *dev) +{ + struct pci_dev *pdev; + struct hv_iommu_endpoint *vdev; + struct hv_output_get_logical_device_property device_iommu_property = {0}; + + if (!dev_is_pci(dev)) + return ERR_PTR(-ENODEV); + + pdev = to_pci_dev(dev); + + if (hv_iommu_get_logical_device_property(dev, + HV_LOGICAL_DEVICE_PROPERTY_PVIOMMU, + &device_iommu_property) || + !(device_iommu_property.device_iommu & HV_DEVICE_IOMMU_ENABLED)) + return ERR_PTR(-ENODEV); + + vdev = kzalloc_obj(*vdev, GFP_KERNEL); + if (!vdev) + return ERR_PTR(-ENOMEM); + + vdev->dev = dev; + vdev->hv_iommu = hv_iommu_device; + dev_iommu_priv_set(dev, vdev); + + if (hv_iommu_ats_supported(hv_iommu_device->cap) && + pci_ats_supported(pdev)) + pci_enable_ats(pdev, __ffs(hv_iommu_device->pgsize_bitmap)); + + return &vdev->hv_iommu->iommu; +} + +static void 
hv_iommu_release_device(struct device *dev) +{ + struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev); + struct pci_dev *pdev = to_pci_dev(dev); + + if (pdev->ats_enabled) + pci_disable_ats(pdev); + + dev_iommu_priv_set(dev, NULL); + + kfree(vdev); +} + +static struct iommu_group *hv_iommu_device_group(struct device *dev) +{ + if (dev_is_pci(dev)) + return pci_device_group(dev); + + WARN_ON_ONCE(1); + return generic_device_group(dev); +} + +static int hv_configure_device_domain(struct hv_iommu_domain *hv_domain, u32 domain_type) +{ + u64 status; + unsigned long flags; + struct pt_iommu_x86_64_hw_info pt_info; + struct hv_input_configure_device_domain *input; + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->device_domain = hv_domain->device_domain; + input->settings.flags.blocked = (domain_type == IOMMU_DOMAIN_BLOCKED); + /* + * Clearing translation_enabled bypasses translation (DMA uses the GPA + * directly), which only suits identity. The hypervisor requires paging + * and blocked domains to keep it set. 
+ */ + input->settings.flags.translation_enabled = (domain_type != IOMMU_DOMAIN_IDENTITY); + + if (domain_type & __IOMMU_DOMAIN_PAGING) { + pt_iommu_x86_64_hw_info(&hv_domain->pt_iommu_x86_64, &pt_info); + input->settings.page_table_root = pt_info.gcr3_pt; + input->settings.flags.first_stage_paging_mode = + pt_info.levels == 5; + } + status = hv_do_hypercall(HVCALL_CONFIGURE_DEVICE_DOMAIN, input, NULL); + + local_irq_restore(flags); + + if (!hv_result_success(status)) + pr_err("HVCALL_CONFIGURE_DEVICE_DOMAIN failed, status %lld\n", status); + + return hv_result_to_errno(status); +} + +static int __init hv_initialize_static_domains(void) +{ + int ret; + struct hv_iommu_domain *hv_domain; + + /* Default stage-1 identity domain */ + hv_domain = &hv_identity_domain; + + ret = hv_create_device_domain(hv_domain, HV_DEVICE_DOMAIN_TYPE_S1); + if (ret) + return ret; + + ret = hv_configure_device_domain(hv_domain, IOMMU_DOMAIN_IDENTITY); + if (ret) + goto delete_identity_domain; + + /* Default stage-1 blocked domain */ + hv_domain = &hv_blocking_domain; + + ret = hv_create_device_domain(hv_domain, HV_DEVICE_DOMAIN_TYPE_S1); + if (ret) + goto delete_identity_domain; + + ret = hv_configure_device_domain(hv_domain, IOMMU_DOMAIN_BLOCKED); + if (ret) + goto delete_blocked_domain; + + return 0; + +delete_blocked_domain: + hv_delete_device_domain(&hv_blocking_domain); +delete_identity_domain: + hv_delete_device_domain(&hv_identity_domain); + return ret; +} + +/* x86 architectural MSI address range */ +#define INTERRUPT_RANGE_START (0xfee00000) +#define INTERRUPT_RANGE_END (0xfeefffff) +static void hv_iommu_get_resv_regions(struct device *dev, + struct list_head *head) +{ + struct iommu_resv_region *region; + + region = iommu_alloc_resv_region(INTERRUPT_RANGE_START, + INTERRUPT_RANGE_END - INTERRUPT_RANGE_START + 1, + 0, IOMMU_RESV_MSI, GFP_KERNEL); + if (!region) + return; + + list_add_tail(&region->list, head); +} + +static void hv_iommu_flush_iotlb_all(struct iommu_domain *domain) 
+{ + hv_flush_device_domain(to_hv_iommu_domain(domain)); +} + +static void hv_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *iotlb_gather) +{ + hv_flush_device_domain(to_hv_iommu_domain(domain)); + + iommu_put_pages_list(&iotlb_gather->freelist); +} + +static void hv_iommu_paging_domain_free(struct iommu_domain *domain) +{ + struct hv_iommu_domain *hv_domain = to_hv_iommu_domain(domain); + + /* Free all remaining mappings */ + pt_iommu_deinit(&hv_domain->pt_iommu); + + hv_delete_device_domain(hv_domain); + + kfree(hv_domain); +} + +static const struct iommu_domain_ops hv_iommu_identity_domain_ops = { + .attach_dev = hv_iommu_attach_dev, +}; + +static const struct iommu_domain_ops hv_iommu_blocking_domain_ops = { + .attach_dev = hv_iommu_blocking_attach_dev, +}; + +static const struct iommu_domain_ops hv_iommu_paging_domain_ops = { + .attach_dev = hv_iommu_attach_dev, + IOMMU_PT_DOMAIN_OPS(x86_64), + .flush_iotlb_all = hv_iommu_flush_iotlb_all, + .iotlb_sync = hv_iommu_iotlb_sync, + .free = hv_iommu_paging_domain_free, +}; + +static struct iommu_domain *hv_iommu_domain_alloc_paging(struct device *dev) +{ + int ret; + struct hv_iommu_domain *hv_domain; + struct pt_iommu_x86_64_cfg cfg = {}; + + hv_domain = kzalloc_obj(*hv_domain, GFP_KERNEL); + if (!hv_domain) + return ERR_PTR(-ENOMEM); + + ret = hv_create_device_domain(hv_domain, HV_DEVICE_DOMAIN_TYPE_S1); + if (ret) + goto err_free; + + hv_domain->pt_iommu.nid = dev_to_node(dev); + + cfg.common.hw_max_vasz_lg2 = hv_iommu_device->max_iova_width; + cfg.common.hw_max_oasz_lg2 = 52; + cfg.top_level = (hv_iommu_device->max_iova_width > 48) ? 
4 : 3; + + ret = pt_iommu_x86_64_init(&hv_domain->pt_iommu_x86_64, &cfg, GFP_KERNEL); + if (ret) + goto err_delete_domain; + + /* Constrain to page sizes the hypervisor supports */ + hv_domain->domain.pgsize_bitmap &= hv_iommu_device->pgsize_bitmap; + + hv_domain->domain.ops = &hv_iommu_paging_domain_ops; + + ret = hv_configure_device_domain(hv_domain, __IOMMU_DOMAIN_PAGING); + if (ret) + goto err_pt_deinit; + + return &hv_domain->domain; + +err_pt_deinit: + pt_iommu_deinit(&hv_domain->pt_iommu); +err_delete_domain: + hv_delete_device_domain(hv_domain); +err_free: + kfree(hv_domain); + return ERR_PTR(ret); +} + +static struct iommu_ops hv_iommu_ops = { + .capable = hv_iommu_capable, + .domain_alloc_paging = hv_iommu_domain_alloc_paging, + .probe_device = hv_iommu_probe_device, + .release_device = hv_iommu_release_device, + .device_group = hv_iommu_device_group, + .get_resv_regions = hv_iommu_get_resv_regions, + .owner = THIS_MODULE, + .identity_domain = &hv_identity_domain.domain, + .blocked_domain = &hv_blocking_domain.domain, + .release_domain = &hv_blocking_domain.domain, +}; + +static int hv_iommu_detect(struct hv_output_get_iommu_capabilities *hv_iommu_cap) +{ + u64 status; + unsigned long flags; + struct hv_input_get_iommu_capabilities *input; + struct hv_output_get_iommu_capabilities *output; + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = (struct hv_output_get_iommu_capabilities *)(input + 1); + memset(input, 0, sizeof(*input)); + input->partition_id = HV_PARTITION_ID_SELF; + status = hv_do_hypercall(HVCALL_GET_IOMMU_CAPABILITIES, input, output); + *hv_iommu_cap = *output; + + local_irq_restore(flags); + + if (!hv_result_success(status)) + pr_err("HVCALL_GET_IOMMU_CAPABILITIES failed, status %lld\n", status); + + return hv_result_to_errno(status); +} + +static void __init hv_init_iommu_device(struct hv_iommu_dev *hv_iommu, + struct hv_output_get_iommu_capabilities *hv_iommu_cap) +{ + ida_init(&hv_iommu->domain_ids); + 
+ hv_iommu->cap = hv_iommu_cap->iommu_cap; + hv_iommu->max_iova_width = hv_iommu_cap->max_iova_width; + if (!hv_iommu_5lvl_supported(hv_iommu->cap) && + hv_iommu->max_iova_width > 48) { + pr_info("5-level paging not supported, limiting iova width to 48.\n"); + hv_iommu->max_iova_width = 48; + } + + hv_iommu->geometry = (struct iommu_domain_geometry) { + .aperture_start = 0, + .aperture_end = (((u64)1) << hv_iommu->max_iova_width) - 1, + .force_aperture = true, + }; + + hv_iommu->first_domain = HV_DEVICE_DOMAIN_ID_DEFAULT + 1; + hv_iommu->last_domain = HV_DEVICE_DOMAIN_ID_NULL - 1; + hv_iommu->pgsize_bitmap = hv_iommu_cap->pgsize_bitmap; + hv_iommu_device = hv_iommu; +} + +int __init hv_iommu_init(void) +{ + int ret = 0; + struct hv_iommu_dev *hv_iommu = NULL; + struct hv_output_get_iommu_capabilities hv_iommu_cap = {0}; + + if (no_iommu || iommu_detected) + return -ENODEV; + + if (!hv_is_hyperv_initialized()) + return -ENODEV; + + ret = hv_iommu_detect(&hv_iommu_cap); + if (ret) { + pr_err("HVCALL_GET_IOMMU_CAPABILITIES failed: %d\n", ret); + return -ENODEV; + } + + if (!hv_iommu_present(hv_iommu_cap.iommu_cap) || + !hv_iommu_s1_domain_supported(hv_iommu_cap.iommu_cap)) { + pr_err("IOMMU capabilities not sufficient: cap=0x%llx\n", + hv_iommu_cap.iommu_cap); + return -ENODEV; + } + + /* + * The page table code only maps x86 page sizes (4K/2M/1G); require the + * hypervisor to advertise a non-empty subset of exactly those. 
+ */ + if (!hv_iommu_cap.pgsize_bitmap || + (hv_iommu_cap.pgsize_bitmap & ~(u64)(SZ_4K | SZ_2M | SZ_1G))) { + pr_err("unsupported page sizes: pgsize_bitmap=0x%llx\n", + hv_iommu_cap.pgsize_bitmap); + return -ENODEV; + } + + iommu_detected = 1; + pci_request_acs(); + + hv_iommu = kzalloc_obj(*hv_iommu, GFP_KERNEL); + if (!hv_iommu) + return -ENOMEM; + + hv_init_iommu_device(hv_iommu, &hv_iommu_cap); + + ret = hv_initialize_static_domains(); + if (ret) { + pr_err("static domains init failed: %d\n", ret); + goto err_free; + } + + ret = iommu_device_sysfs_add(&hv_iommu->iommu, NULL, NULL, "%s", "hv-iommu"); + if (ret) { + pr_err("iommu_device_sysfs_add failed: %d\n", ret); + goto err_delete_static_domains; + } + + ret = iommu_device_register(&hv_iommu->iommu, &hv_iommu_ops, NULL); + if (ret) { + pr_err("iommu_device_register failed: %d\n", ret); + goto err_sysfs_remove; + } + + pr_info("successfully initialized\n"); + return 0; + +err_sysfs_remove: + iommu_device_sysfs_remove(&hv_iommu->iommu); +err_delete_static_domains: + hv_delete_device_domain(&hv_blocking_domain); + hv_delete_device_domain(&hv_identity_domain); +err_free: + kfree(hv_iommu); + return ret; +} diff --git a/drivers/iommu/hyperv/iommu.h b/drivers/iommu/hyperv/iommu.h new file mode 100644 index 000000000000..3a9f40fa2403 --- /dev/null +++ b/drivers/iommu/hyperv/iommu.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Hyper-V IOMMU driver. + * + * Copyright (C) 2024-2025, Microsoft, Inc. 
+ * + */ + +#ifndef _HYPERV_IOMMU_H +#define _HYPERV_IOMMU_H + +struct hv_iommu_dev { + struct iommu_device iommu; + struct ida domain_ids; + + /* Device configuration */ + u8 max_iova_width; + u8 max_pasid_width; + u64 cap; + u64 pgsize_bitmap; + + struct iommu_domain_geometry geometry; + u64 first_domain; + u64 last_domain; +}; + +struct hv_iommu_domain { + union { + struct iommu_domain domain; + struct pt_iommu pt_iommu; + struct pt_iommu_x86_64 pt_iommu_x86_64; + }; + struct hv_iommu_dev *hv_iommu; + struct hv_input_device_domain device_domain; + u64 pgsize_bitmap; +}; + +PT_IOMMU_CHECK_DOMAIN(struct hv_iommu_domain, pt_iommu, domain); +PT_IOMMU_CHECK_DOMAIN(struct hv_iommu_domain, pt_iommu_x86_64.iommu, domain); + +struct hv_iommu_endpoint { + struct device *dev; + struct hv_iommu_dev *hv_iommu; + struct hv_iommu_domain *hv_domain; +}; + +#define to_hv_iommu_domain(d) \ + container_of(d, struct hv_iommu_domain, domain) + +#endif /* _HYPERV_IOMMU_H */ -- 2.52.0

