[PATCH v4 21/22] trace/iommu: add sva trace events
Signed-off-by: Jacob Pan--- include/trace/events/iommu.h | 112 +++ 1 file changed, 112 insertions(+) diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h index 72b4582..e64eb29 100644 --- a/include/trace/events/iommu.h +++ b/include/trace/events/iommu.h @@ -12,6 +12,8 @@ #define _TRACE_IOMMU_H #include +#include +#include struct device; @@ -161,6 +163,116 @@ DEFINE_EVENT(iommu_error, io_page_fault, TP_ARGS(dev, iova, flags) ); + +TRACE_EVENT(dev_fault, + + TP_PROTO(struct device *dev, struct iommu_fault_event *evt), + + TP_ARGS(dev, evt), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __field(int, type) + __field(int, reason) + __field(u64, addr) + __field(u32, pasid) + __field(u32, pgid) + __field(u32, last_req) + __field(u32, prot) + ), + + TP_fast_assign( + __assign_str(device, dev_name(dev)); + __entry->type = evt->type; + __entry->reason = evt->reason; + __entry->addr = evt->addr; + __entry->pasid = evt->pasid; + __entry->pgid = evt->page_req_group_id; + __entry->last_req = evt->last_req; + __entry->prot = evt->prot; + ), + + TP_printk("IOMMU:%s type=%d reason=%d addr=0x%016llx pasid=%d group=%d last=%d prot=%d", + __get_str(device), + __entry->type, + __entry->reason, + __entry->addr, + __entry->pasid, + __entry->pgid, + __entry->last_req, + __entry->prot + ) +); + +TRACE_EVENT(dev_page_response, + + TP_PROTO(struct device *dev, struct page_response_msg *msg), + + TP_ARGS(dev, msg), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __field(int, code) + __field(u64, addr) + __field(u32, pasid) + __field(u32, pgid) + ), + + TP_fast_assign( + __assign_str(device, dev_name(dev)); + __entry->code = msg->resp_code; + __entry->addr = msg->addr; + __entry->pasid = msg->pasid; + __entry->pgid = msg->page_req_group_id; + ), + + TP_printk("IOMMU:%s code=%d addr=0x%016llx pasid=%d group=%d", + __get_str(device), + __entry->code, + __entry->addr, + __entry->pasid, + __entry->pgid + ) +); + +TRACE_EVENT(sva_invalidate, + + TP_PROTO(struct device *dev, struct tlb_invalidate_info *ti), + + TP_ARGS(dev, ti), + + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __field(int, type) + __field(u32, granu) + __field(u32, flags) + __field(u8, size) + __field(u32, pasid) + __field(u64, addr) + ), + + TP_fast_assign( + __assign_str(device, dev_name(dev)); + __entry->type = ti->hdr.type; + __entry->flags = ti->flags; + __entry->granu = ti->granularity; + __entry->size = ti->size; + __entry->pasid = ti->pasid; + __entry->addr = ti->addr; + ), + + TP_printk("IOMMU:%s type=%d flags=0x%08x granu=%d size=%d pasid=%d addr=0x%016llx", + __get_str(device), + __entry->type, + __entry->flags, + __entry->granu, + __entry->size, + __entry->pasid, + __entry->addr + ) +); + + #endif /* _TRACE_IOMMU_H */ /* This part must be outside protection */ -- 2.7.4 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v4 18/22] iommu/intel-svm: replace dev ops with fault report API
With the introduction of generic IOMMU device fault reporting API, we can replace the private fault callback functions with standard function and event data. Signed-off-by: Jacob Pan--- drivers/iommu/intel-svm.c | 7 +-- include/linux/intel-svm.h | 20 +++- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index bdc702c..3a7bc77 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -299,7 +299,7 @@ static const struct mmu_notifier_ops intel_mmuops = { static DEFINE_MUTEX(pasid_mutex); -int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops) +int intel_svm_bind_mm(struct device *dev, int *pasid, int flags) { struct intel_iommu *iommu = intel_svm_device_to_iommu(dev); struct intel_svm_dev *sdev; @@ -346,10 +346,6 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ list_for_each_entry(sdev, >devs, list) { if (dev == sdev->dev) { - if (sdev->ops != ops) { - ret = -EBUSY; - goto out; - } sdev->users++; goto success; } @@ -375,7 +371,6 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ } /* Finish the setup now we know we're keeping it */ sdev->users = 1; - sdev->ops = ops; init_rcu_head(>rcu); if (!svm) { diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 99bc5b3..a39a502 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -18,18 +18,6 @@ struct device; -struct svm_dev_ops { - void (*fault_cb)(struct device *dev, int pasid, u64 address, -u32 private, int rwxp, int response); -}; - -/* Values for rxwp in fault_cb callback */ -#define SVM_REQ_READ (1<<3) -#define SVM_REQ_WRITE (1<<2) -#define SVM_REQ_EXEC (1<<1) -#define SVM_REQ_PRIV (1<<0) - - /* * The SVM_FLAG_PRIVATE_PASID flag requests a PASID which is *not* the "main" * PASID for the current process. Even if a PASID already exists, a new one @@ -60,7 +48,6 @@ struct svm_dev_ops { * @dev: Device to be granted acccess * @pasid: Address for allocated PASID * @flags: Flags. Later for requesting supervisor mode, etc. - * @ops: Callbacks to device driver * * This function attempts to enable PASID support for the given device. * If the @pasid argument is non-%NULL, a PASID is allocated for access @@ -82,8 +69,7 @@ struct svm_dev_ops { * Multiple calls from the same process may result in the same PASID * being re-used. A reference count is kept. */ -extern int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, -struct svm_dev_ops *ops); +extern int intel_svm_bind_mm(struct device *dev, int *pasid, int flags); /** * intel_svm_unbind_mm() - Unbind a specified PASID @@ -120,7 +106,7 @@ extern int intel_svm_is_pasid_valid(struct device *dev, int pasid); #else /* CONFIG_INTEL_IOMMU_SVM */ static inline int intel_svm_bind_mm(struct device *dev, int *pasid, - int flags, struct svm_dev_ops *ops) + int flags) { return -ENOSYS; } @@ -136,6 +122,6 @@ static int intel_svm_is_pasid_valid(struct device *dev, int pasid) } #endif /* CONFIG_INTEL_IOMMU_SVM */ -#define intel_svm_available(dev) (!intel_svm_bind_mm((dev), NULL, 0, NULL)) +#define intel_svm_available(dev) (!intel_svm_bind_mm((dev), NULL, 0)) #endif /* __INTEL_SVM_H__ */ -- 2.7.4 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v4 15/22] iommu/config: add build dependency for dmar
Intel VT-d interrupts come from both IRQ remapping and DMA remapping. In order to report non-recoverable faults back to device driver, we need to have access to IOMMU fault reporting APIs. This patch adds build depenency to DMAR code where fault IRQ handlers can selectively report faults. Signed-off-by: Jacob Pan--- drivers/iommu/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index f3a2134..45db4cc 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -138,6 +138,7 @@ config AMD_IOMMU_V2 # Intel IOMMU support config DMAR_TABLE bool + select IOMMU_API config INTEL_IOMMU bool "Support for Intel IOMMU using DMA Remapping Devices" -- 2.7.4 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v4 10/22] iommu: introduce device fault data
Device faults detected by IOMMU can be reported outside IOMMU subsystem for further processing. This patch intends to provide a generic device fault data such that device drivers can be communicated with IOMMU faults without model specific knowledge. The proposed format is the result of discussion at: https://lkml.org/lkml/2017/11/10/291 Part of the code is based on Jean-Philippe Brucker's patchset (https://patchwork.kernel.org/patch/9989315/). The assumption is that model specific IOMMU driver can filter and handle most of the internal faults if the cause is within IOMMU driver control. Therefore, the fault reasons can be reported are grouped and generalized based common specifications such as PCI ATS. Signed-off-by: Jacob PanSigned-off-by: Jean-Philippe Brucker Signed-off-by: Liu, Yi L Signed-off-by: Ashok Raj --- include/linux/iommu.h | 102 +- 1 file changed, 100 insertions(+), 2 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index da684a7..9eaa907 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -49,13 +49,17 @@ struct bus_type; struct device; struct iommu_domain; struct notifier_block; +struct iommu_fault_event; /* iommu fault flags */ -#define IOMMU_FAULT_READ 0x0 -#define IOMMU_FAULT_WRITE 0x1 +#define IOMMU_FAULT_READ (1 << 0) +#define IOMMU_FAULT_WRITE (1 << 1) +#define IOMMU_FAULT_EXEC (1 << 2) +#define IOMMU_FAULT_PRIV (1 << 3) typedef int (*iommu_fault_handler_t)(struct iommu_domain *, struct device *, unsigned long, int, void *); +typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault_event *, void *); struct iommu_domain_geometry { dma_addr_t aperture_start; /* First address that can be mapped*/ @@ -264,6 +268,99 @@ struct iommu_device { struct device *dev; }; +/* Generic fault types, can be expanded IRQ remapping fault */ +enum iommu_fault_type { + IOMMU_FAULT_DMA_UNRECOV = 1,/* unrecoverable fault */ + IOMMU_FAULT_PAGE_REQ, /* page request fault */ +}; + +enum iommu_fault_reason { + IOMMU_FAULT_REASON_UNKNOWN = 0, + + /* IOMMU internal error, no specific reason to report out */ + IOMMU_FAULT_REASON_INTERNAL, + + /* Could not access the PASID table */ + IOMMU_FAULT_REASON_PASID_FETCH, + + /* +* PASID is out of range (e.g. exceeds the maximum PASID +* supported by the IOMMU) or disabled. +*/ + IOMMU_FAULT_REASON_PASID_INVALID, + + /* Could not access the page directory (Invalid PASID entry) */ + IOMMU_FAULT_REASON_PGD_FETCH, + + /* Could not access the page table entry (Bad address) */ + IOMMU_FAULT_REASON_PTE_FETCH, + + /* Protection flag check failed */ + IOMMU_FAULT_REASON_PERMISSION, +}; + +/** + * struct iommu_fault_event - Generic per device fault data + * + * - PCI and non-PCI devices + * - Recoverable faults (e.g. page request), information based on PCI ATS + * and PASID spec. + * - Un-recoverable faults of device interest + * - DMA remapping and IRQ remapping faults + + * @type contains fault type. + * @reason fault reasons if relevant outside IOMMU driver, IOMMU driver internal + * faults are not reported + * @addr: tells the offending page address + * @pasid: contains process address space ID, used in shared virtual memory(SVM) + * @rid: requestor ID + * @page_req_group_id: page request group index + * @last_req: last request in a page request group + * @pasid_valid: indicates if the PRQ has a valid PASID + * @prot: page access protection flag, e.g. IOMMU_FAULT_READ, IOMMU_FAULT_WRITE + * @device_private: if present, uniquely identify device-specific + * private data for an individual page request. + * @iommu_private: used by the IOMMU driver for storing fault-specific + * data. Users should not modify this field before + * sending the fault response. + */ +struct iommu_fault_event { + enum iommu_fault_type type; + enum iommu_fault_reason reason; + u64 addr; + u32 pasid; + u32 page_req_group_id; + u32 last_req : 1; + u32 pasid_valid : 1; + u32 prot; + u64 device_private; + u64 iommu_private; +}; + +/** + * struct iommu_fault_param - per-device IOMMU fault data + * @dev_fault_handler: Callback function to handle IOMMU faults at device level + * @data: handler private data + * + */ +struct iommu_fault_param { + iommu_dev_fault_handler_t handler; + void *data; +}; + +/** + * struct iommu_param - collection of per-device IOMMU data + * + * @fault_param: IOMMU detected device fault reporting data + * + * TODO: migrate other per device data pointers under iommu_dev_data, e.g. + * struct
[PATCH v4 13/22] iommu: introduce page response function
IO page faults can be handled outside IOMMU subsystem. For an example, when nested translation is turned on and guest owns the first level page tables, device page request can be forwared to the guest for handling faults. As the page response returns by the guest, IOMMU driver on the host need to process the response which informs the device and completes the page request transaction. This patch introduces generic API function for page response passing from the guest or other in-kernel users. The definitions of the generic data is based on PCI ATS specification not limited to any vendor. Signed-off-by: Jean-Philippe BruckerSigned-off-by: Jacob Pan Link: https://lkml.org/lkml/2017/12/7/1725 --- drivers/iommu/iommu.c | 45 include/linux/iommu.h | 52 +++ 2 files changed, 97 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index cd653ff..42e8f00 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1503,6 +1503,51 @@ int iommu_sva_invalidate(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_sva_invalidate); +int iommu_page_response(struct device *dev, + struct page_response_msg *msg) +{ + struct iommu_param *param = dev->iommu_param; + int ret = -EINVAL; + struct iommu_fault_event *evt, *iter; + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + + if (!domain || !domain->ops->page_response) + return -ENODEV; + + /* +* Device iommu_param should have been allocated when device is +* added to its iommu_group. +*/ + if (!param || !param->fault_param) + return -EINVAL; + + /* Only send response if there is a fault report pending */ + mutex_lock(>fault_param->lock); + if (list_empty(>fault_param->faults)) { + pr_warn("no pending PRQ, drop response\n"); + goto done_unlock; + } + /* +* Check if we have a matching page request pending to respond, +* otherwise return -EINVAL +*/ + list_for_each_entry_safe(evt, iter, >fault_param->faults, list) { + if (evt->pasid == msg->pasid && + msg->page_req_group_id == evt->page_req_group_id) { + msg->private_data = evt->iommu_private; + ret = domain->ops->page_response(dev, msg); + list_del(>list); + kfree(evt); + break; + } + } + +done_unlock: + mutex_unlock(>fault_param->lock); + return ret; +} +EXPORT_SYMBOL_GPL(iommu_page_response); + static void __iommu_detach_device(struct iommu_domain *domain, struct device *dev) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b89e881..af0e393 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -163,6 +163,55 @@ struct iommu_resv_region { #ifdef CONFIG_IOMMU_API /** + * enum page_response_code - Return status of fault handlers, telling the IOMMU + * driver how to proceed with the fault. + * + * @IOMMU_FAULT_STATUS_SUCCESS: Fault has been handled and the page tables + * populated, retry the access. This is "Success" in PCI PRI. + * @IOMMU_FAULT_STATUS_FAILURE: General error. Drop all subsequent faults from + * this device if possible. This is "Response Failure" in PCI PRI. + * @IOMMU_FAULT_STATUS_INVALID: Could not handle this fault, don't retry the + * access. This is "Invalid Request" in PCI PRI. + */ +enum page_response_code { + IOMMU_PAGE_RESP_SUCCESS = 0, + IOMMU_PAGE_RESP_INVALID, + IOMMU_PAGE_RESP_FAILURE, +}; + +/** + * enum page_request_handle_t - Return page request/response handler status + * + * @IOMMU_FAULT_STATUS_HANDLED: Stop processing the fault, and do not send a + * reply to the device. + * @IOMMU_FAULT_STATUS_CONTINUE: Fault was not handled. Call the next handler, + * or terminate. + */ +enum page_request_handle_t { + IOMMU_PAGE_RESP_HANDLED = 0, + IOMMU_PAGE_RESP_CONTINUE, +}; + +/** + * Generic page response information based on PCI ATS and PASID spec. + * @addr: servicing page address + * @pasid: contains process address space ID + * @resp_code: response code + * @page_req_group_id: page request group index + * @type: group or stream/single page response + * @private_data: uniquely identify device-specific private data for an + *individual page response + */ +struct page_response_msg { + u64 addr; + u32 pasid; + enum page_response_code resp_code; + u32 pasid_present:1; + u32 page_req_group_id; + u64 private_data; +}; + +/** * struct iommu_ops - iommu ops and capabilities * @capable: check capability * @domain_alloc: allocate iommu domain @@ -195,6 +244,7
[PATCH v4 16/22] iommu/vt-d: report non-recoverable faults to device
Currently, dmar fault IRQ handler does nothing more than rate limited printk, no critical hardware handling need to be done in IRQ context. For some use case such as vIOMMU, it might be useful to report non-recoverable faults outside host IOMMU subsystem. DMAR fault can come from both DMA and interrupt remapping which has to be set up early before threaded IRQ is available. This patch adds an option and a workqueue such that when faults are requested, DMAR fault IRQ handler can use the IOMMU fault reporting API to report. Signed-off-by: Jacob PanSigned-off-by: Liu, Yi L Signed-off-by: Ashok Raj --- drivers/iommu/dmar.c| 157 ++-- drivers/iommu/intel-iommu.c | 6 +- include/linux/dmar.h| 2 +- include/linux/intel-iommu.h | 1 + 4 files changed, 157 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 2ed4979..0f1abfc 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1110,6 +1110,12 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) return err; } +static inline void dmar_free_fault_wq(struct intel_iommu *iommu) +{ + if (iommu->fault_wq) + destroy_workqueue(iommu->fault_wq); +} + static void free_iommu(struct intel_iommu *iommu) { if (intel_iommu_enabled) { @@ -1126,6 +1132,7 @@ static void free_iommu(struct intel_iommu *iommu) free_irq(iommu->irq, iommu); dmar_free_hwirq(iommu->irq); iommu->irq = 0; + dmar_free_fault_wq(iommu); } if (iommu->qi) { @@ -1554,6 +1561,31 @@ static const char *irq_remap_fault_reasons[] = "Blocked an interrupt request due to source-id verification failure", }; +/* fault data and status */ +enum intel_iommu_fault_reason { + INTEL_IOMMU_FAULT_REASON_SW, + INTEL_IOMMU_FAULT_REASON_ROOT_NOT_PRESENT, + INTEL_IOMMU_FAULT_REASON_CONTEXT_NOT_PRESENT, + INTEL_IOMMU_FAULT_REASON_CONTEXT_INVALID, + INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH, + INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS, + INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS, + INTEL_IOMMU_FAULT_REASON_NEXT_PT_INVALID, + INTEL_IOMMU_FAULT_REASON_ROOT_ADDR_INVALID, + INTEL_IOMMU_FAULT_REASON_CONTEXT_PTR_INVALID, + INTEL_IOMMU_FAULT_REASON_NONE_ZERO_RTP, + INTEL_IOMMU_FAULT_REASON_NONE_ZERO_CTP, + INTEL_IOMMU_FAULT_REASON_NONE_ZERO_PTE, + NR_INTEL_IOMMU_FAULT_REASON, +}; + +/* fault reasons that are allowed to be reported outside IOMMU subsystem */ +#define INTEL_IOMMU_FAULT_REASON_ALLOWED \ + ((1ULL << INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH) | \ + (1ULL << INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS) | \ + (1ULL << INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS)) + + static const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type) { if (fault_reason >= 0x20 && (fault_reason - 0x20 < @@ -1634,11 +1666,90 @@ void dmar_msi_read(int irq, struct msi_msg *msg) raw_spin_unlock_irqrestore(>register_lock, flag); } +static enum iommu_fault_reason to_iommu_fault_reason(u8 reason) +{ + if (reason >= NR_INTEL_IOMMU_FAULT_REASON) { + pr_warn("unknown DMAR fault reason %d\n", reason); + return IOMMU_FAULT_REASON_UNKNOWN; + } + switch (reason) { + case INTEL_IOMMU_FAULT_REASON_SW: + case INTEL_IOMMU_FAULT_REASON_ROOT_NOT_PRESENT: + case INTEL_IOMMU_FAULT_REASON_CONTEXT_NOT_PRESENT: + case INTEL_IOMMU_FAULT_REASON_CONTEXT_INVALID: + case INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH: + case INTEL_IOMMU_FAULT_REASON_ROOT_ADDR_INVALID: + case INTEL_IOMMU_FAULT_REASON_CONTEXT_PTR_INVALID: + return IOMMU_FAULT_REASON_INTERNAL; + case INTEL_IOMMU_FAULT_REASON_NEXT_PT_INVALID: + case INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS: + case INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS: + return IOMMU_FAULT_REASON_PERMISSION; + default: + return IOMMU_FAULT_REASON_UNKNOWN; + } +} + +struct dmar_fault_work { + struct work_struct fault_work; + u64 addr; + int type; + int fault_type; + enum intel_iommu_fault_reason reason; + u16 sid; +}; + +static void report_fault_to_device(struct work_struct *work) +{ + struct dmar_fault_work *dfw = container_of(work, struct dmar_fault_work, + fault_work); + struct iommu_fault_event event; + struct pci_dev *pdev; + u8 bus, devfn; + + memset(, 0, sizeof(struct iommu_fault_event)); + + /* check if fault reason is permitted to report outside IOMMU */ + if (!((1 << dfw->reason) & INTEL_IOMMU_FAULT_REASON_ALLOWED)) { + pr_debug("Fault reason %d not allowed to
[PATCH v4 14/22] iommu: handle page response timeout
When IO page faults are reported outside IOMMU subsystem, the page request handler may fail for various reasons. E.g. a guest received page requests but did not have a chance to run for a long time. The irresponsive behavior could hold off limited resources on the pending device. There can be hardware or credit based software solutions as suggested in the PCI ATS Ch-4. To provide a basic safty net this patch introduces a per device deferrable timer which monitors the longest pending page fault that requires a response. Proper action such as sending failure response code could be taken when timer expires but not included in this patch. We need to consider the life cycle of page groupd ID to prevent confusion with reused group ID by a device. For now, a warning message provides clue of such failure. Signed-off-by: Jacob PanSigned-off-by: Ashok Raj --- drivers/iommu/iommu.c | 60 +-- include/linux/iommu.h | 4 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 42e8f00..1e74944 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -799,6 +799,39 @@ int iommu_group_unregister_notifier(struct iommu_group *group, } EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier); +/* Max time to wait for a pending page request */ +#define IOMMU_PAGE_RESPONSE_MAXTIME (HZ * 10) +static void iommu_dev_fault_timer_fn(struct timer_list *t) +{ + struct iommu_fault_param *fparam = from_timer(fparam, t, timer); + struct iommu_fault_event *evt, *iter; + + u64 now; + + now = get_jiffies_64(); + + /* The goal is to ensure driver or guest page fault handler(via vfio) +* send page response on time. Otherwise, limited queue resources +* may be occupied by some irresponsive guests or drivers. +* When per device pending fault list is not empty, we periodically checks +* if any anticipated page response time has expired. +* +* TODO: +* We could do the following if response time expires: +* 1. send page response code FAILURE to all pending PRQ +* 2. inform device driver or vfio +* 3. drain in-flight page requests and responses for this device +* 4. clear pending fault list such that driver can unregister fault +*handler(otherwise blocked when pending faults are present). +*/ + list_for_each_entry_safe(evt, iter, >faults, list) { + if (time_after64(evt->expire, now)) + pr_err("Page response time expired!, pasid %d gid %d exp %llu now %llu\n", + evt->pasid, evt->page_req_group_id, evt->expire, now); + } + mod_timer(t, now + IOMMU_PAGE_RESPONSE_MAXTIME); +} + /** * iommu_register_device_fault_handler() - Register a device fault handler * @dev: the device @@ -806,8 +839,8 @@ EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier); * @data: private data passed as argument to the handler * * When an IOMMU fault event is received, call this handler with the fault event - * and data as argument. The handler should return 0. If the fault is - * recoverable (IOMMU_FAULT_PAGE_REQ), the handler must also complete + * and data as argument. The handler should return 0 on success. If the fault is + * recoverable (IOMMU_FAULT_PAGE_REQ), the handler can also complete * the fault by calling iommu_page_response() with one of the following * response code: * - IOMMU_PAGE_RESP_SUCCESS: retry the translation @@ -848,6 +881,9 @@ int iommu_register_device_fault_handler(struct device *dev, param->fault_param->data = data; INIT_LIST_HEAD(>fault_param->faults); + timer_setup(>fault_param->timer, iommu_dev_fault_timer_fn, + TIMER_DEFERRABLE); + mutex_unlock(>lock); return 0; @@ -905,6 +941,8 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) { int ret = 0; struct iommu_fault_event *evt_pending; + struct timer_list *tmr; + u64 exp; struct iommu_fault_param *fparam; /* iommu_param is allocated when device is added to group */ @@ -925,6 +963,17 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) goto done_unlock; } memcpy(evt_pending, evt, sizeof(struct iommu_fault_event)); + /* Keep track of response expiration time */ + exp = get_jiffies_64() + IOMMU_PAGE_RESPONSE_MAXTIME; + evt_pending->expire = exp; + + if (list_empty(>faults)) { + /* First pending event, start timer */ + tmr = >iommu_param->fault_param->timer; + WARN_ON(timer_pending(tmr)); + mod_timer(tmr, exp); + } +
[PATCH v4 17/22] iommu/intel-svm: report device page request
If the source device of a page request has its PASID table pointer bound to a guest, the first level page tables are owned by the guest. In this case, we shall let guest OS to manage page fault. This patch uses the IOMMU fault reporting API to send fault events, possibly via VFIO, to the guest OS. Once guest pages are fault in, guest will issue page response which will be passed down via the invalidation passdown APIs. Recoverable faults, such as page request reporting is not limitted to guest use. In kernel driver can also request a chance to receive fault notifications. Signed-off-by: Jacob PanSigned-off-by: Ashok Raj --- drivers/iommu/intel-svm.c | 71 --- include/linux/iommu.h | 1 + 2 files changed, 62 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 99bc9bd..bdc702c 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -580,6 +580,56 @@ static bool is_canonical_address(u64 addr) return (((saddr << shift) >> shift) == saddr); } +static int prq_to_iommu_prot(struct page_req_dsc *req) +{ + int prot = 0; + + if (req->rd_req) + prot |= IOMMU_FAULT_READ; + if (req->wr_req) + prot |= IOMMU_FAULT_WRITE; + if (req->exe_req) + prot |= IOMMU_FAULT_EXEC; + if (req->priv_req) + prot |= IOMMU_FAULT_PRIV; + + return prot; +} + +static int intel_svm_prq_report(struct page_req_dsc *desc) +{ + int ret = 0; + struct iommu_fault_event event; + struct pci_dev *pdev; + + memset(, 0, sizeof(struct iommu_fault_event)); + pdev = pci_get_bus_and_slot(desc->bus, desc->devfn); + if (!pdev) { + pr_err("No PCI device found for PRQ [%02x:%02x.%d]\n", + desc->bus, PCI_SLOT(desc->devfn), + PCI_FUNC(desc->devfn)); + return -ENODEV; + } + + /* Fill in event data for device specific processing */ + event.type = IOMMU_FAULT_PAGE_REQ; + event.addr = (u64)desc->addr << VTD_PAGE_SHIFT; + event.pasid = desc->pasid; + event.page_req_group_id = desc->prg_index; + event.prot = prq_to_iommu_prot(desc); + event.last_req = desc->lpig; + event.pasid_valid = 1; + /* keep track of PRQ so that when the response comes back, we know +* whether we do group response or stream response. SRR[0] and +* private[54:32] bits in the descriptor are stored. +*/ + event.iommu_private = *(u64 *)desc; + ret = iommu_report_device_fault(>dev, ); + pci_dev_put(pdev); + + return ret; +} + static irqreturn_t prq_event_thread(int irq, void *d) { struct intel_iommu *iommu = d; @@ -628,6 +678,16 @@ static irqreturn_t prq_event_thread(int irq, void *d) goto no_pasid; } } + /* If address is not canonical, return invalid response */ + if (!is_canonical_address(address)) + goto bad_req; + + /* +* If prq is to be handled outside iommu driver via receiver of +* the fault notifiers, we skip the page response here. +*/ + if (!intel_svm_prq_report(req)) + goto prq_advance; result = QI_RESP_INVALID; /* Since we're using init_mm.pgd directly, we should never take @@ -638,9 +698,6 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (!mmget_not_zero(svm->mm)) goto bad_req; - /* If address is not canonical, return invalid response */ - if (!is_canonical_address(address)) - goto bad_req; down_read(>mm->mmap_sem); vma = find_extend_vma(svm->mm, address); @@ -673,12 +730,6 @@ static irqreturn_t prq_event_thread(int irq, void *d) if (WARN_ON(>list == >devs)) sdev = NULL; - - if (sdev && sdev->ops && sdev->ops->fault_cb) { - int rwxp = (req->rd_req << 3) | (req->wr_req << 2) | - (req->exe_req << 1) | (req->priv_req); - sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr, req->private, rwxp, result); - } /* We get here in the error case where the PASID lookup failed, and these can be NULL. Do not use them below this point! */ sdev = NULL; @@ -704,7 +755,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) qi_submit_sync(, iommu); } - + prq_advance: head = (head + sizeof(*req)) & PRQ_RING_MASK; } diff --git a/include/linux/iommu.h
[PATCH v4 20/22] iommu/vt-d: add intel iommu page response function
This patch adds page response support for Intel VT-d. Generic response data is taken from the IOMMU API then parsed into VT-d specific response descriptor format. Signed-off-by: Jacob Pan--- drivers/iommu/intel-iommu.c | 47 + include/linux/intel-iommu.h | 3 +++ 2 files changed, 50 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 3229e20..8d73ff0 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5195,6 +5195,52 @@ static int intel_iommu_sva_invalidate(struct iommu_domain *domain, return ret; } +int intel_iommu_page_response(struct device *dev, struct page_response_msg *msg) +{ + struct qi_desc resp; + struct intel_iommu *iommu; + struct pci_dev *pdev; + u8 bus, devfn; + u16 rid; + u64 desc; + + pdev = to_pci_dev(dev); + iommu = device_to_iommu(dev, , ); + if (!iommu) { + dev_err(dev, "No IOMMU for device to unbind PASID table\n"); + return -ENODEV; + } + + pci_dev_get(pdev); + rid = ((u16)bus << 8) | devfn; + /* Iommu private data contains preserved page request descriptor, so we +* inspect the SRR bit for response type then queue response with only +* the private data [54:32]. +*/ + desc = msg->private_data; + if (desc & QI_PRQ_SRR) { + /* Page Stream Response */ + resp.low = QI_PSTRM_IDX(msg->page_req_group_id) | + (desc & QI_PRQ_PRIV) | QI_PSTRM_BUS(PCI_BUS_NUM(pdev->bus->number)) | + QI_PSTRM_PASID(msg->pasid) | QI_PSTRM_RESP_TYPE; + resp.high = QI_PSTRM_ADDR(msg->addr) | QI_PSTRM_DEVFN(pdev->devfn & 0xff) | + QI_PSTRM_RESP_CODE(msg->resp_code); + } else { + /* Page Group Response */ + resp.low = QI_PGRP_PASID(msg->pasid) | + QI_PGRP_DID(rid) | + QI_PGRP_PASID_P(msg->pasid_present) | + QI_PGRP_RESP_TYPE; + resp.high = QI_PGRP_IDX(msg->page_req_group_id) | + (desc & QI_PRQ_PRIV) | QI_PGRP_RESP_CODE(msg->resp_code); + + } + qi_submit_sync(, iommu); + pci_dev_put(pdev); + + return 0; +} + static int intel_iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t hpa, size_t size, int iommu_prot) @@ -5625,6 +5671,7 @@ const struct iommu_ops intel_iommu_ops = { .bind_pasid_table = intel_iommu_bind_pasid_table, .unbind_pasid_table = intel_iommu_unbind_pasid_table, .sva_invalidate = intel_iommu_sva_invalidate, + .page_response = intel_iommu_page_response, #endif .map= intel_iommu_map, .unmap = intel_iommu_unmap, diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index dacb6cf..d2e1b5c 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -337,6 +337,9 @@ enum { #define QI_PSTRM_BUS(bus) (((u64)(bus)) << 24) #define QI_PSTRM_PASID(pasid) (((u64)(pasid)) << 4) +#define QI_PRQ_SRR BIT_ULL(0) +#define QI_PRQ_PRIVGENMASK_ULL(54, 32) + #define QI_RESP_SUCCESS0x0 #define QI_RESP_INVALID0x1 #define QI_RESP_FAILURE0xf -- 2.7.4 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v4 09/22] iommu/vt-d: add svm/sva invalidate function
When Shared Virtual Address (SVA) is enabled for a guest OS via vIOMMU, we need to provide invalidation support at IOMMU API and driver level. This patch adds Intel VT-d specific function to implement iommu passdown invalidate API for shared virtual address. The use case is for supporting caching structure invalidation of assigned SVM capable devices. Emulated IOMMU exposes queue invalidation capability and passes down all descriptors from the guest to the physical IOMMU. The assumption is that guest to host device ID mapping should be resolved prior to calling IOMMU driver. Based on the device handle, host IOMMU driver can replace certain fields before submit to the invalidation queue. Signed-off-by: Liu, Yi LSigned-off-by: Ashok Raj Signed-off-by: Jacob Pan --- drivers/iommu/intel-iommu.c | 170 1 file changed, 170 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 53e9b7b..771ee1e 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5026,6 +5026,175 @@ static void intel_iommu_detach_device(struct iommu_domain *domain, dmar_remove_one_dev_info(to_dmar_domain(domain), dev); } +/* + * 3D array for converting IOMMU generic type-granularity to VT-d granularity + * X indexed by enum iommu_inv_type + * Y indicates request without and with PASID + * Z indexed by enum iommu_inv_granularity + * + * For an example, if we want to find the VT-d granularity encoding for IOTLB + * type, DMA request with PASID, and page selective. The look up indices are: + * [1][1][8], where + * 1: IOMMU_INV_TYPE_TLB + * 1: with PASID + * 8: IOMMU_INV_GRANU_PAGE_PASID + * + * Granu_map array indicates validity of the table. 1: valid, 0: invalid + * + */ +const static int inv_type_granu_map[IOMMU_INV_NR_TYPE][2][IOMMU_INV_NR_GRANU] = { + /* extended dev IOTLBs, for dev-IOTLB, only global is valid, + for dev-EXIOTLB, two valid granu */ + { + {1}, + {0, 0, 0, 0, 1, 1, 0, 0, 0} + }, + /* IOTLB and EIOTLB */ + { + {1, 1, 0, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 1, 0, 1, 1, 1} + }, + /* PASID cache */ + { + {0}, + {0, 0, 0, 0, 1, 1, 0, 0, 0} + }, + /* context cache */ + { + {1, 1, 1} + } +}; + +const static u64 inv_type_granu_table[IOMMU_INV_NR_TYPE][2][IOMMU_INV_NR_GRANU] = { + /* extended dev IOTLBs, only global is valid */ + { + {QI_DEV_IOTLB_GRAN_ALL}, + {0, 0, 0, 0, QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0, 0, 0} + }, + /* IOTLB and EIOTLB */ + { + {DMA_TLB_GLOBAL_FLUSH, DMA_TLB_DSI_FLUSH, 0, DMA_TLB_PSI_FLUSH}, + {0, 0, 0, 0, QI_GRAN_ALL_ALL, 0, QI_GRAN_NONG_ALL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID} + }, + /* PASID cache */ + { + {0}, + {0, 0, 0, 0, QI_PC_ALL_PASIDS, QI_PC_PASID_SEL} + }, + /* context cache */ + { + {DMA_CCMD_GLOBAL_INVL, DMA_CCMD_DOMAIN_INVL, DMA_CCMD_DEVICE_INVL} + } +}; + +static inline int to_vtd_granularity(int type, int granu, int with_pasid, u64 *vtd_granu) +{ + if (type >= IOMMU_INV_NR_TYPE || granu >= IOMMU_INV_NR_GRANU || with_pasid > 1) + return -EINVAL; + + if (inv_type_granu_map[type][with_pasid][granu] == 0) + return -EINVAL; + + *vtd_granu = inv_type_granu_table[type][with_pasid][granu]; + + return 0; +} + +static int intel_iommu_sva_invalidate(struct iommu_domain *domain, + struct device *dev, struct tlb_invalidate_info *inv_info) +{ + struct intel_iommu *iommu; + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct device_domain_info *info; + u16 did, sid; + u8 bus, devfn; + int ret = 0; + u64 granu; + unsigned long flags; + + if (!inv_info || !dmar_domain) + return -EINVAL; + + iommu = device_to_iommu(dev, , ); + if (!iommu) + return -ENODEV; + + if (!dev || !dev_is_pci(dev)) + return -ENODEV; + + did = dmar_domain->iommu_did[iommu->seq_id]; + sid = PCI_DEVID(bus, devfn); + ret = to_vtd_granularity(inv_info->hdr.type, inv_info->granularity, + !!(inv_info->flags & IOMMU_INVALIDATE_PASID_TAGGED), ); + if (ret) { + pr_err("Invalid range type %d, granu %d\n", inv_info->hdr.type, + inv_info->granularity); + return ret; + } + + spin_lock(>lock); + spin_lock_irqsave(_domain_lock, flags); + + switch (inv_info->hdr.type) { + case IOMMU_INV_TYPE_CONTEXT: + iommu->flush.flush_context(iommu, did, sid, +
[PATCH v4 19/22] iommu/intel-svm: do not flush iotlb for viommu
vIOMMU passdown invalidation will be inclusive, PASID cache invalidation includes TLBs. See Intel VT-d Specification Ch 6.5.2.2 for details. Signed-off-by: Jacob Pan--- drivers/iommu/intel-svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index 3a7bc77..66b8e10 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -284,7 +284,9 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) rcu_read_lock(); list_for_each_entry_rcu(sdev, >devs, list) { intel_flush_pasid_dev(svm, sdev, svm->pasid); - intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm); + /* for emulated iommu, PASID cache invalidation implies IOTLB/DTLB */ + if (!cap_caching_mode(svm->iommu->cap)) + intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm); } rcu_read_unlock(); -- 2.7.4 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v4 22/22] iommu: use sva invalidate and device fault trace event
For performance and debugging purposes, these trace events help analyzing device faults and passdown invalidations that interact with IOMMU subsystem. E.g. IOMMU::00:0a.0 type=2 reason=0 addr=0x007ff000 pasid=1 group=1 last=0 prot=1 Signed-off-by: Jacob Pan--- drivers/iommu/iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 1e74944..e5fc345 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -979,6 +979,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) mutex_unlock(>lock); } ret = fparam->handler(evt, fparam->data); + trace_dev_fault(dev, evt); done_unlock: mutex_unlock(>iommu_param->lock); return ret; @@ -1547,6 +1548,7 @@ int iommu_sva_invalidate(struct iommu_domain *domain, return -ENODEV; ret = domain->ops->sva_invalidate(domain, dev, inv_info); + trace_sva_invalidate(dev, inv_info); return ret; } @@ -1584,6 +1586,7 @@ int iommu_page_response(struct device *dev, if (evt->pasid == msg->pasid && msg->page_req_group_id == evt->page_req_group_id) { msg->private_data = evt->iommu_private; + trace_dev_page_response(dev, msg); ret = domain->ops->page_response(dev, msg); list_del(>list); kfree(evt); -- 2.7.4 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v4 11/22] driver core: add per device iommu param
DMA faults can be detected by IOMMU at device level. Adding a pointer to struct device allows IOMMU subsystem to report relevant faults back to the device driver for further handling. For direct assigned device (or user space drivers), guest OS holds responsibility to handle and respond per device IOMMU fault. Therefore we need fault reporting mechanism to propagate faults beyond IOMMU subsystem. There are two other IOMMU data pointers under struct device today, here we introduce iommu_param as a parent pointer such that all device IOMMU data can be consolidated here. The idea was suggested here by Greg KH and Joerg. The name iommu_param is chosen here since iommu_data has been used. Suggested-by: Greg Kroah-HartmanSigned-off-by: Jacob Pan Signed-off-by: Jean-Philippe Brucker Link: https://lkml.org/lkml/2017/10/6/81 --- include/linux/device.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/device.h b/include/linux/device.h index b093405..35d9f34 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -41,6 +41,7 @@ struct iommu_ops; struct iommu_group; struct iommu_fwspec; struct dev_pin_info; +struct iommu_param; struct bus_attribute { struct attributeattr; @@ -872,6 +873,7 @@ struct dev_links_info { * device (i.e. the bus driver that discovered the device). * @iommu_group: IOMMU group the device belongs to. * @iommu_fwspec: IOMMU-specific properties supplied by firmware. + * @iommu_param: Per device generic IOMMU runtime data * * @offline_disabled: If set, the device is permanently online. * @offline: Set after successful invocation of bus type's .offline(). @@ -961,6 +963,7 @@ struct device { void(*release)(struct device *dev); struct iommu_group *iommu_group; struct iommu_fwspec *iommu_fwspec; + struct iommu_param *iommu_param; booloffline_disabled:1; booloffline:1; -- 2.7.4 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v4 12/22] iommu: introduce device fault report API
Traditionally, device specific faults are detected and handled within their own device drivers. When IOMMU is enabled, faults such as DMA related transactions are detected by IOMMU. There is no generic reporting mechanism to report faults back to the in-kernel device driver or the guest OS in case of assigned devices. Faults detected by IOMMU is based on the transaction's source ID which can be reported at per device basis, regardless of the device type is a PCI device or not. The fault types include recoverable (e.g. page request) and unrecoverable faults(e.g. access error). In most cases, faults can be handled by IOMMU drivers internally. The primary use cases are as follows: 1. page request fault originated from an SVM capable device that is assigned to guest via vIOMMU. In this case, the first level page tables are owned by the guest. Page request must be propagated to the guest to let guest OS fault in the pages then send page response. In this mechanism, the direct receiver of IOMMU fault notification is VFIO, which can relay notification events to QEMU or other user space software. 2. faults need more subtle handling by device drivers. Other than simply invoke reset function, there are needs to let device driver handle the fault with a smaller impact. This patchset is intended to create a generic fault report API such that it can scale as follows: - all IOMMU types - PCI and non-PCI devices - recoverable and unrecoverable faults - VFIO and other other in kernel users - DMA & IRQ remapping (TBD) The original idea was brought up by David Woodhouse and discussions summarized at https://lwn.net/Articles/608914/. Signed-off-by: Jacob PanSigned-off-by: Ashok Raj Signed-off-by: Jean-Philippe Brucker --- drivers/iommu/iommu.c | 147 +- include/linux/iommu.h | 35 +++- 2 files changed, 179 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 466f789..cd653ff 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -581,6 +581,13 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev) goto err_free_name; } + dev->iommu_param = kzalloc(sizeof(*dev->iommu_param), GFP_KERNEL); + if (!dev->iommu_param) { + ret = -ENOMEM; + goto err_free_name; + } + mutex_init(>iommu_param->lock); + kobject_get(group->devices_kobj); dev->iommu_group = group; @@ -611,6 +618,7 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev) mutex_unlock(>mutex); dev->iommu_group = NULL; kobject_put(group->devices_kobj); + kfree(dev->iommu_param); err_free_name: kfree(device->name); err_remove_link: @@ -657,7 +665,7 @@ void iommu_group_remove_device(struct device *dev) sysfs_remove_link(>kobj, "iommu_group"); trace_remove_device_from_group(group->id, dev); - + kfree(dev->iommu_param); kfree(device->name); kfree(device); dev->iommu_group = NULL; @@ -792,6 +800,143 @@ int iommu_group_unregister_notifier(struct iommu_group *group, EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier); /** + * iommu_register_device_fault_handler() - Register a device fault handler + * @dev: the device + * @handler: the fault handler + * @data: private data passed as argument to the handler + * + * When an IOMMU fault event is received, call this handler with the fault event + * and data as argument. The handler should return 0. If the fault is + * recoverable (IOMMU_FAULT_PAGE_REQ), the handler must also complete + * the fault by calling iommu_page_response() with one of the following + * response code: + * - IOMMU_PAGE_RESP_SUCCESS: retry the translation + * - IOMMU_PAGE_RESP_INVALID: terminate the fault + * - IOMMU_PAGE_RESP_FAILURE: terminate the fault and stop reporting + * page faults if possible. + * + * Return 0 if the fault handler was installed successfully, or an error. + */ +int iommu_register_device_fault_handler(struct device *dev, + iommu_dev_fault_handler_t handler, + void *data) +{ + struct iommu_param *param = dev->iommu_param; + + /* +* Device iommu_param should have been allocated when device is +* added to its iommu_group. +*/ + if (!param) + return -EINVAL; + + /* Only allow one fault handler registered for each device */ + if (param->fault_param) + return -EBUSY; + + mutex_lock(>lock); + get_device(dev); + param->fault_param = + kzalloc(sizeof(struct iommu_fault_param), GFP_ATOMIC); + if (!param->fault_param) { + put_device(dev); + mutex_unlock(>lock); + return -ENOMEM; + }
[PATCH v4 07/22] iommu/vt-d: fix dev iotlb pfsid use
PFSID should be used in the invalidation descriptor for flushing device IOTLBs on SRIOV VFs. Signed-off-by: Jacob Pan--- drivers/iommu/dmar.c| 6 +++--- drivers/iommu/intel-iommu.c | 16 +++- include/linux/intel-iommu.h | 5 ++--- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 9a7ffd1..78f7e70 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1339,8 +1339,8 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, qi_submit_sync(, iommu); } -void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep, - u64 addr, unsigned mask) +void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u16 qdep, u64 addr, unsigned mask) { struct qi_desc desc; @@ -1355,7 +1355,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep, qdep = 0; desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | - QI_DIOTLB_TYPE; + QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid); qi_submit_sync(, iommu); } diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 8667727..53e9b7b 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1482,6 +1482,19 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info) return; pdev = to_pci_dev(info->dev); + /* For IOMMU that supports device IOTLB throttling (DIT), we assign +* PFSID to the invalidation desc of a VF such that IOMMU HW can gauge +* queue depth at PF level. If DIT is not set, PFSID will be treated as +* reserved, which should be set to 0. +*/ + if (!ecap_dit(info->iommu->ecap)) + info->pfsid = 0; + else if (pdev && pdev->is_virtfn) { + if (ecap_dit(info->iommu->ecap)) + dev_warn(>dev, "SRIOV VF device IOTLB enabled without flow control\n"); + info->pfsid = PCI_DEVID(pdev->physfn->bus->number, pdev->physfn->devfn); + } else + info->pfsid = PCI_DEVID(info->bus, info->devfn); #ifdef CONFIG_INTEL_IOMMU_SVM /* The PCIe spec, in its wisdom, declares that the behaviour of @@ -1547,7 +1560,8 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain, sid = info->bus << 8 | info->devfn; qdep = info->ats_qdep; - qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask); + qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, + qdep, addr, mask); } spin_unlock_irqrestore(_domain_lock, flags); } diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 0e3b618..1c9375b 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -477,9 +477,8 @@ extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, u64 type); extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type); -extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep, - u64 addr, unsigned mask); - +extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u16 qdep, u64 addr, unsigned mask); extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu); extern int dmar_ir_support(void); -- 2.7.4 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v4 08/22] iommu/vt-d: support flushing more translation cache types
When Shared Virtual Memory is exposed to a guest via vIOMMU, extended IOTLB invalidation may be passed down from outside IOMMU subsystems. This patch adds invalidation functions that can be used for additional translation cache types. Signed-off-by: Jacob Pan--- drivers/iommu/dmar.c| 44 include/linux/intel-iommu.h | 21 +++-- 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 78f7e70..2ed4979 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -1339,6 +1339,18 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, qi_submit_sync(, iommu); } +void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr, u32 pasid, + unsigned int size_order, u64 granu, bool global) +{ + struct qi_desc desc; + + desc.low = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(granu) | QI_EIOTLB_TYPE; + desc.high = QI_EIOTLB_ADDR(addr) | QI_EIOTLB_GL(global) | + QI_EIOTLB_IH(0) | QI_EIOTLB_AM(size_order); + qi_submit_sync(, iommu); +} + void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, u16 qdep, u64 addr, unsigned mask) { @@ -1360,6 +1372,38 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, qi_submit_sync(, iommu); } +void qi_flush_dev_eiotlb(struct intel_iommu *iommu, u16 sid, + u32 pasid, u16 qdep, u64 addr, unsigned size, u64 granu) +{ + struct qi_desc desc; + + desc.low = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE; + desc.high |= QI_DEV_EIOTLB_GLOB(granu); + + /* If S bit is 0, we only flush a single page. If S bit is set, +* The least significant zero bit indicates the size. VT-d spec +* 6.5.2.6 +*/ + if (!size) + desc.high = QI_DEV_EIOTLB_ADDR(addr) & ~QI_DEV_EIOTLB_SIZE; + else { + unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size); + + desc.high = QI_DEV_EIOTLB_ADDR(addr & ~mask) | QI_DEV_EIOTLB_SIZE; + } + qi_submit_sync(, iommu); +} + +void qi_flush_pasid(struct intel_iommu *iommu, u16 did, u64 granu, int pasid) +{ + struct qi_desc desc; + + desc.high = 0; + desc.low = QI_PC_TYPE | QI_PC_DID(did) | QI_PC_GRAN(granu) | QI_PC_PASID(pasid); + + qi_submit_sync(, iommu); +} /* * Disable Queued Invalidation interface. */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 1c9375b..245ac7e 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -262,6 +262,10 @@ enum { #define QI_PGRP_RESP_TYPE 0x9 #define QI_PSTRM_RESP_TYPE 0xa +#define QI_DID(did)(((u64)did & 0x) << 16) +#define QI_DID_MASKGENMASK(31, 16) +#define QI_TYPE_MASK GENMASK(3, 0) + #define QI_IEC_SELECTIVE (((u64)1) << 4) #define QI_IEC_IIDEX(idx) (((u64)(idx & 0x) << 32)) #define QI_IEC_IM(m) (((u64)(m & 0x1f) << 27)) @@ -293,8 +297,9 @@ enum { #define QI_PC_DID(did) (((u64)did) << 16) #define QI_PC_GRAN(gran) (((u64)gran) << 4) -#define QI_PC_ALL_PASIDS (QI_PC_TYPE | QI_PC_GRAN(0)) -#define QI_PC_PASID_SEL(QI_PC_TYPE | QI_PC_GRAN(1)) +/* PASID cache invalidation granu */ +#define QI_PC_ALL_PASIDS 0 +#define QI_PC_PASID_SEL1 #define QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) #define QI_EIOTLB_GL(gl) (((u64)gl) << 7) @@ -304,6 +309,10 @@ enum { #define QI_EIOTLB_DID(did) (((u64)did) << 16) #define QI_EIOTLB_GRAN(gran) (((u64)gran) << 4) +/* QI Dev-IOTLB inv granu */ +#define QI_DEV_IOTLB_GRAN_ALL 0 +#define QI_DEV_IOTLB_GRAN_PASID_SEL1 + #define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK) #define QI_DEV_EIOTLB_SIZE (((u64)1) << 11) #define QI_DEV_EIOTLB_GLOB(g) ((u64)g) @@ -332,6 +341,7 @@ enum { #define QI_RESP_INVALID0x1 #define QI_RESP_FAILURE0xf +/* QI EIOTLB inv granu */ #define QI_GRAN_ALL_ALL0 #define QI_GRAN_NONG_ALL 1 #define QI_GRAN_NONG_PASID 2 @@ -477,8 +487,15 @@ extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, u64 type); extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type); +extern void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr, + u32 pasid, unsigned int size_order, u64 type, bool global); extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, u16 qdep, u64 addr, unsigned mask); + +extern void
[PATCH v4 02/22] iommu/vt-d: move device_domain_info to header
Allow both intel-iommu.c and dmar.c to access device_domain_info. Prepare for additional per device arch data used in TLB flush function Signed-off-by: Jacob Pan--- drivers/iommu/intel-iommu.c | 18 -- include/linux/intel-iommu.h | 19 +++ 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 582fd01..15fa30d 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -414,24 +414,6 @@ struct dmar_domain { iommu core */ }; -/* PCI domain-device relationship */ -struct device_domain_info { - struct list_head link; /* link to domain siblings */ - struct list_head global; /* link to global list */ - u8 bus; /* PCI bus number */ - u8 devfn; /* PCI devfn number */ - u8 pasid_supported:3; - u8 pasid_enabled:1; - u8 pri_supported:1; - u8 pri_enabled:1; - u8 ats_supported:1; - u8 ats_enabled:1; - u8 ats_qdep; - struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ - struct intel_iommu *iommu; /* IOMMU used by this device */ - struct dmar_domain *domain; /* pointer to domain */ -}; - struct dmar_rmrr_unit { struct list_head list; /* list of rmrr units */ struct acpi_dmar_header *hdr; /* ACPI header */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 8dad3dd..13b44bb 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -434,6 +434,25 @@ struct intel_iommu { u32 flags; /* Software defined flags */ }; +/* PCI domain-device relationship */ +struct device_domain_info { + struct list_head link; /* link to domain siblings */ + struct list_head global; /* link to global list */ + u8 bus; /* PCI bus number */ + u8 devfn; /* PCI devfn number */ + u8 pasid_supported:3; + u8 pasid_enabled:1; + u8 pri_supported:1; + u8 pri_enabled:1; + u8 ats_supported:1; + u8 ats_enabled:1; + u8 ats_qdep; + u64 fault_mask; /* selected IOMMU faults to be reported */ + struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ + struct intel_iommu *iommu; /* IOMMU used by this device */ + struct dmar_domain *domain; /* pointer to domain */ +}; + static inline void __iommu_flush_cache( struct intel_iommu *iommu, void *addr, int size) { -- 2.7.4 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v4 04/22] iommu/vt-d: add bind_pasid_table function
Add Intel VT-d ops to the generic iommu_bind_pasid_table API functions. The primary use case is for direct assignment of SVM capable device. Originated from emulated IOMMU in the guest, the request goes through many layers (e.g. VFIO). Upon calling host IOMMU driver, caller passes guest PASID table pointer (GPA) and size. Device context table entry is modified by Intel IOMMU specific bind_pasid_table function. This will turn on nesting mode and matching translation type. The unbind operation restores default context mapping. Signed-off-by: Jacob PanSigned-off-by: Liu, Yi L Signed-off-by: Ashok Raj --- drivers/iommu/intel-iommu.c | 119 ++ include/linux/dma_remapping.h | 1 + 2 files changed, 120 insertions(+) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 15fa30d..8667727 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2432,6 +2432,7 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, info->ats_supported = info->pasid_supported = info->pri_supported = 0; info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0; info->ats_qdep = 0; + info->pasid_table_bound = 0; info->dev = dev; info->domain = domain; info->iommu = iommu; @@ -5189,6 +5190,7 @@ static void intel_iommu_put_resv_regions(struct device *dev, #ifdef CONFIG_INTEL_IOMMU_SVM #define MAX_NR_PASID_BITS (20) +#define MIN_NR_PASID_BITS (5) static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu) { /* @@ -5315,6 +5317,119 @@ struct intel_iommu *intel_svm_device_to_iommu(struct device *dev) return iommu; } + +static int intel_iommu_bind_pasid_table(struct iommu_domain *domain, + struct device *dev, struct pasid_table_config *pasidt_binfo) +{ + struct intel_iommu *iommu; + struct context_entry *context; + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct device_domain_info *info; + struct pci_dev *pdev; + u8 bus, devfn, host_table_pasid_bits; + u16 did, sid; + int ret = 0; + unsigned long flags; + u64 ctx_lo; + + iommu = device_to_iommu(dev, , ); + if (!iommu) + return -ENODEV; + /* VT-d spec section 9.4 says pasid table size is encoded as 2^(x+5) */ + host_table_pasid_bits = intel_iommu_get_pts(iommu) + MIN_NR_PASID_BITS; + if (!pasidt_binfo || pasidt_binfo->pasid_bits > host_table_pasid_bits || + pasidt_binfo->pasid_bits < MIN_NR_PASID_BITS) { + pr_err("Invalid gPASID bits %d, host range %d - %d\n", + pasidt_binfo->pasid_bits, + MIN_NR_PASID_BITS, host_table_pasid_bits); + return -ERANGE; + } + if (!ecap_nest(iommu->ecap)) { + dev_err(dev, "Cannot bind PASID table, no nested translation\n"); + ret = -EINVAL; + goto out; + } + pdev = to_pci_dev(dev); + sid = PCI_DEVID(bus, devfn); + info = dev->archdata.iommu; + + if (!info) { + dev_err(dev, "Invalid device domain info\n"); + ret = -EINVAL; + goto out; + } + if (info->pasid_table_bound) { + dev_err(dev, "Device PASID table already bound\n"); + ret = -EBUSY; + goto out; + } + if (!info->pasid_enabled) { + ret = pci_enable_pasid(pdev, info->pasid_supported & ~1); + if (ret) { + dev_err(dev, "Failed to enable PASID\n"); + goto out; + } + } + spin_lock_irqsave(>lock, flags); + context = iommu_context_addr(iommu, bus, devfn, 0); + if (!context_present(context)) { + dev_err(dev, "Context not present\n"); + ret = -EINVAL; + goto out_unlock; + } + + /* Anticipate guest to use SVM and owns the first level, so we turn +* nested mode on +*/ + ctx_lo = context[0].lo; + ctx_lo |= CONTEXT_NESTE | CONTEXT_PRS | CONTEXT_PASIDE; + ctx_lo &= ~CONTEXT_TT_MASK; + ctx_lo |= CONTEXT_TT_DEV_IOTLB << 2; + context[0].lo = ctx_lo; + + /* Assign guest PASID table pointer and size order */ + ctx_lo = (pasidt_binfo->base_ptr & VTD_PAGE_MASK) | + (pasidt_binfo->pasid_bits - MIN_NR_PASID_BITS); + context[1].lo = ctx_lo; + /* make sure context entry is updated before flushing */ + wmb(); + did = dmar_domain->iommu_did[iommu->seq_id]; + iommu->flush.flush_context(iommu, did, + (((u16)bus) << 8) | devfn, + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); +
[PATCH v4 06/22] iommu/vt-d: add definitions for PFSID
When SRIOV VF device IOTLB is invalidated, we need to provide the PF source ID such that IOMMU hardware can gauge the depth of invalidation queue which is shared among VFs. This is needed when device invalidation throttle (DIT) capability is supported. This patch adds bit definitions for checking and tracking PFSID. Signed-off-by: Jacob Pan--- include/linux/intel-iommu.h | 4 1 file changed, 4 insertions(+) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index e4a16dc..0e3b618 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -114,6 +114,7 @@ * Extended Capability Register */ +#define ecap_dit(e)((e >> 41) & 0x1) #define ecap_pasid(e) ((e >> 40) & 0x1) #define ecap_pss(e)((e >> 35) & 0x1f) #define ecap_eafs(e) ((e >> 34) & 0x1) @@ -284,6 +285,7 @@ enum { #define QI_DEV_IOTLB_SID(sid) ((u64)((sid) & 0x) << 32) #define QI_DEV_IOTLB_QDEP(qdep)(((qdep) & 0x1f) << 16) #define QI_DEV_IOTLB_ADDR(addr)((u64)(addr) & VTD_PAGE_MASK) +#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xff0) << 48)) #define QI_DEV_IOTLB_SIZE 1 #define QI_DEV_IOTLB_MAX_INVS 32 @@ -308,6 +310,7 @@ enum { #define QI_DEV_EIOTLB_PASID(p) (((u64)p) << 32) #define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0x) << 16) #define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4) +#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xff0) << 48)) #define QI_DEV_EIOTLB_MAX_INVS 32 #define QI_PGRP_IDX(idx) (((u64)(idx)) << 55) @@ -440,6 +443,7 @@ struct device_domain_info { struct list_head global; /* link to global list */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ + u16 pfsid; /* SRIOV physical function source ID */ u8 pasid_supported:3; u8 pasid_enabled:1; u8 pri_supported:1; -- 2.7.4 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v4 05/22] iommu: introduce iommu invalidate API function
From: "Liu, Yi L"When an SVM capable device is assigned to a guest, the first level page tables are owned by the guest and the guest PASID table pointer is linked to the device context entry of the physical IOMMU. Host IOMMU driver has no knowledge of caching structure updates unless the guest invalidation activities are passed down to the host. The primary usage is derived from emulated IOMMU in the guest, where QEMU can trap invalidation activities before passing them down to the host/physical IOMMU. Since the invalidation data are obtained from user space and will be written into physical IOMMU, we must allow security check at various layers. Therefore, generic invalidation data format are proposed here, model specific IOMMU drivers need to convert them into their own format. Signed-off-by: Liu, Yi L Signed-off-by: Jean-Philippe Brucker Signed-off-by: Jacob Pan Signed-off-by: Ashok Raj --- drivers/iommu/iommu.c | 14 include/linux/iommu.h | 12 +++ include/uapi/linux/iommu.h | 79 ++ 3 files changed, 105 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 734ed09..466f789 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1344,6 +1344,20 @@ void iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev) } EXPORT_SYMBOL_GPL(iommu_unbind_pasid_table); +int iommu_sva_invalidate(struct iommu_domain *domain, + struct device *dev, struct tlb_invalidate_info *inv_info) +{ + int ret = 0; + + if (unlikely(!domain->ops->sva_invalidate)) + return -ENODEV; + + ret = domain->ops->sva_invalidate(domain, dev, inv_info); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_sva_invalidate); + static void __iommu_detach_device(struct iommu_domain *domain, struct device *dev) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0f6f6c5..da684a7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -190,6 +190,7 @@ struct iommu_resv_region { * @pgsize_bitmap: bitmap of all possible supported page sizes * @bind_pasid_table: bind pasid table pointer for guest SVM * @unbind_pasid_table: unbind pasid table pointer and restore defaults + * @sva_invalidate: invalidate translation caches of shared virtual address */ struct iommu_ops { bool (*capable)(enum iommu_cap); @@ -243,6 +244,8 @@ struct iommu_ops { struct pasid_table_config *pasidt_binfo); void (*unbind_pasid_table)(struct iommu_domain *domain, struct device *dev); + int (*sva_invalidate)(struct iommu_domain *domain, + struct device *dev, struct tlb_invalidate_info *inv_info); unsigned long pgsize_bitmap; }; @@ -309,6 +312,9 @@ extern int iommu_bind_pasid_table(struct iommu_domain *domain, struct device *dev, struct pasid_table_config *pasidt_binfo); extern void iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev); +extern int iommu_sva_invalidate(struct iommu_domain *domain, + struct device *dev, struct tlb_invalidate_info *inv_info); + extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot); @@ -720,6 +726,12 @@ void iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev) { } +static inline int iommu_sva_invalidate(struct iommu_domain *domain, + struct device *dev, struct tlb_invalidate_info *inv_info) +{ + return -EINVAL; +} + #endif /* CONFIG_IOMMU_API */ #endif /* __LINUX_IOMMU_H */ diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h index 9f7a6bf..4447943 100644 --- a/include/uapi/linux/iommu.h +++ b/include/uapi/linux/iommu.h @@ -29,4 +29,83 @@ struct pasid_table_config { __u8 pasid_bits; }; +/** + * enum iommu_inv_granularity - Generic invalidation granularity + * + * When an invalidation request is sent to IOMMU to flush translation caches, + * it may carry different granularity. These granularity levels are not specific + * to a type of translation cache. For an example, PASID selective granularity + * is only applicable to PASID cache invalidation. + * This enum is a collection of granularities for all types of translation + * caches. The idea is to make it easy for IOMMU model specific driver do + * conversion from generic to model specific value. + */ +enum iommu_inv_granularity { + IOMMU_INV_GRANU_DOMAIN = 1, /* all TLBs associated with a domain */ + IOMMU_INV_GRANU_DEVICE, /* caching structure associated with a +
[PATCH v4 00/22] IOMMU and VT-d driver support for Shared Virtual Address (SVA)
Hi All, Shared virtual address (SVA), a.k.a, Shared virtual memory (SVM) on Intel platforms allow address space sharing between device DMA and applications. SVA can reduce programming complexity and enhance security. To enable SVA in the guest, i.e. shared guest application address space and physical device DMA address, IOMMU driver must provide some new functionalities. This patchset is a follow-up on the discussions held at LPC 2017 VFIO/IOMMU/PCI track. Slides and notes can be found here: https://linuxplumbersconf.org/2017/ocw/events/LPC2017/tracks/636 The complete guest SVA support also involves changes in QEMU and VFIO, which has been posted earlier. https://www.spinics.net/lists/kvm/msg148798.html This is the IOMMU portion follow up of the more complete series of the kernel changes to support vSVA. Please refer to the link below for more details. https://www.spinics.net/lists/kvm/msg148819.html Generic APIs are introduced in addition to Intel VT-d specific changes, the goal is to have common interfaces across IOMMU and device types for both VFIO and other in-kernel users. At the top level, new IOMMU interfaces are introduced as follows: - bind guest PASID table - passdown invalidations of translation caches - IOMMU device fault reporting including page request/response and non-recoverable faults. For IOMMU detected device fault reporting, struct device is extended to provide callback and tracking at device level. The original proposal was discussed here "Error handling for I/O memory management units" (https://lwn.net/Articles/608914/). I have experimented two alternative solutions: 1. use a shared group notifier, this does not scale well also causes unwanted notification traffic when group sibling device is reported with faults. 2. place fault callback at device IOMMU arch data, e.g. device_domain_info in Intel/FSL IOMMU driver. This will cause code duplication, since per device fault reporting is generic. The additional patches are Intel VT-d specific, which either implements or replaces existing private interfaces with the generic ones. This patchset is based on the work and ideas from many people, especially: Ashok RajLiu, Yi L Jean-Philippe Brucker Thanks, Jacob V4 - Futher integrate feedback for iommu_param and iommu_fault_param from Jean and others. - Handle fault reporting error and race conditions. Keep tracking per device pending page requests such that page group response can be sanitized. - Added a timer to handle irresponsive guest who does not send page response on time. - Use a workqueue for VT-d non-recorverable IRQ fault handling. - Added trace events for invalidation and fault reporting. V3 - Consolidated fault reporting data format based on discussions on v2, including input from ARM and AMD. - Renamed invalidation APIs from svm to sva based on discussions on v2 - Use a parent pointer under struct device for all iommu per device data - Simplified device fault callback, allow driver private data to be registered. This might make it easy to replace domain fault handler. V2 - Replaced hybrid interface data model (generic data + vendor specific data) with all generic data. This will have the security benefit where data passed from user space can be sanitized by all software layers if needed. - Addressed review comments from V1 - Use per device fault report data - Support page request/response communications between host IOMMU and guest or other in-kernel users. - Added unrecoverable fault reporting to DMAR - Use threaded IRQ function for DMAR fault interrupt and fault reporting Jacob Pan (21): iommu: introduce bind_pasid_table API function iommu/vt-d: move device_domain_info to header iommu/vt-d: add a flag for pasid table bound status iommu/vt-d: add bind_pasid_table function iommu/vt-d: add definitions for PFSID iommu/vt-d: fix dev iotlb pfsid use iommu/vt-d: support flushing more translation cache types iommu/vt-d: add svm/sva invalidate function iommu: introduce device fault data driver core: add per device iommu param iommu: introduce device fault report API iommu: introduce page response function iommu: handle page response timeout iommu/config: add build dependency for dmar iommu/vt-d: report non-recoverable faults to device iommu/intel-svm: report device page request iommu/intel-svm: replace dev ops with fault report API iommu/intel-svm: do not flush iotlb for viommu iommu/vt-d: add intel iommu page response function trace/iommu: add sva trace events iommu: use sva invalidate and device fault trace event Liu, Yi L (1): iommu: introduce iommu invalidate API function drivers/iommu/Kconfig | 1 +
[PATCH v4 01/22] iommu: introduce bind_pasid_table API function
Virtual IOMMU was proposed to support Shared Virtual Memory (SVM) use in the guest: https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html As part of the proposed architecture, when an SVM capable PCI device is assigned to a guest, nested mode is turned on. Guest owns the first level page tables (request with PASID) which performs GVA->GPA translation. Second level page tables are owned by the host for GPA->HPA translation for both request with and without PASID. A new IOMMU driver interface is therefore needed to perform tasks as follows: * Enable nested translation and appropriate translation type * Assign guest PASID table pointer (in GPA) and size to host IOMMU This patch introduces new API functions to perform bind/unbind guest PASID tables. Based on common data, model specific IOMMU drivers can be extended to perform the specific steps for binding pasid table of assigned devices. Signed-off-by: Jean-Philippe BruckerSigned-off-by: Liu, Yi L Signed-off-by: Ashok Raj Signed-off-by: Jacob Pan --- drivers/iommu/iommu.c | 19 +++ include/linux/iommu.h | 24 include/uapi/linux/iommu.h | 32 3 files changed, 75 insertions(+) create mode 100644 include/uapi/linux/iommu.h diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 69fef99..734ed09 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1325,6 +1325,25 @@ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) } EXPORT_SYMBOL_GPL(iommu_attach_device); +int iommu_bind_pasid_table(struct iommu_domain *domain, struct device *dev, + struct pasid_table_config *pasidt_binfo) +{ + if (unlikely(!domain->ops->bind_pasid_table)) + return -ENODEV; + + return domain->ops->bind_pasid_table(domain, dev, pasidt_binfo); +} +EXPORT_SYMBOL_GPL(iommu_bind_pasid_table); + +void iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev) +{ + if (unlikely(!domain->ops->unbind_pasid_table)) + return; + + domain->ops->unbind_pasid_table(domain, dev); +} +EXPORT_SYMBOL_GPL(iommu_unbind_pasid_table); + static void __iommu_detach_device(struct iommu_domain *domain, struct device *dev) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 41b8c57..0f6f6c5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -25,6 +25,7 @@ #include #include #include +#include #define IOMMU_READ (1 << 0) #define IOMMU_WRITE(1 << 1) @@ -187,6 +188,8 @@ struct iommu_resv_region { * @domain_get_windows: Return the number of windows for a domain * @of_xlate: add OF master IDs to iommu grouping * @pgsize_bitmap: bitmap of all possible supported page sizes + * @bind_pasid_table: bind pasid table pointer for guest SVM + * @unbind_pasid_table: unbind pasid table pointer and restore defaults */ struct iommu_ops { bool (*capable)(enum iommu_cap); @@ -233,8 +236,14 @@ struct iommu_ops { u32 (*domain_get_windows)(struct iommu_domain *domain); int (*of_xlate)(struct device *dev, struct of_phandle_args *args); + bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev); + int (*bind_pasid_table)(struct iommu_domain *domain, struct device *dev, + struct pasid_table_config *pasidt_binfo); + void (*unbind_pasid_table)(struct iommu_domain *domain, + struct device *dev); + unsigned long pgsize_bitmap; }; @@ -296,6 +305,10 @@ extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); +extern int iommu_bind_pasid_table(struct iommu_domain *domain, + struct device *dev, struct pasid_table_config *pasidt_binfo); +extern void iommu_unbind_pasid_table(struct iommu_domain *domain, + struct device *dev); extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot); @@ -696,6 +709,17 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) return NULL; } +static inline +int iommu_bind_pasid_table(struct iommu_domain *domain, struct device *dev, + struct pasid_table_config *pasidt_binfo) +{ + return -EINVAL; +} +static inline +void iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev) +{ +} + #endif /* CONFIG_IOMMU_API */ #endif /* __LINUX_IOMMU_H */ diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
[PATCH v4 03/22] iommu/vt-d: add a flag for pasid table bound status
Adding a flag in device domain into to track whether a guest or user PASID table is bound to a device. Signed-off-by: Jacob Pan--- include/linux/intel-iommu.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 13b44bb..e4a16dc 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -446,6 +446,7 @@ struct device_domain_info { u8 pri_enabled:1; u8 ats_supported:1; u8 ats_enabled:1; + u8 pasid_table_bound:1; u8 ats_qdep; u64 fault_mask; /* selected IOMMU faults to be reported */ struct device *dev; /* it's NULL for PCIe-to-PCI bridge */ -- 2.7.4 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v5 7/7] iommu/dma: Move PCI window region reservation back into dma specific path.
On 22/03/18 16:21, Alex Williamson wrote: On Thu, 15 Mar 2018 16:35:09 + Shameer Kolothumwrote: This pretty much reverts commit 273df9635385 ("iommu/dma: Make PCI window reservation generic") by moving the PCI window region reservation back into the dma specific path so that these regions doesn't get exposed via the IOMMU API interface. With this change, the vfio interface will report only iommu specific reserved regions to the user space. Cc: Robin Murphy Cc: Joerg Roedel Signed-off-by: Shameer Kolothum --- As currently ordered, we expose the iova list to the user in 5/7 with the PCI window reservations still intact. Isn't that a bisection problem? This patch should come before the iova list is expose to the user. This is otherwise independent, so I can pop it up in the stack on commit, but I'd need an ack from Joerg and Robin to take it via my tree. Thanks, If it counts, the changes look right, so: Acked-by: Robin Murphy but it does look like there's a hard dependency on Joerg's core branch where Shameer's ITS workaround patches are currently queued. Otherwise, though, I don't think there's anything else due to be touching iommu-dma just yet. Robin. Alex drivers/iommu/dma-iommu.c | 54 ++- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index f05f3cf..ddcbbdb 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -167,40 +167,16 @@ EXPORT_SYMBOL(iommu_put_dma_cookie); * @list: Reserved region list from iommu_get_resv_regions() * * IOMMU drivers can use this to implement their .get_resv_regions callback - * for general non-IOMMU-specific reservations. Currently, this covers host - * bridge windows for PCI devices and GICv3 ITS region reservation on ACPI - * based ARM platforms that may require HW MSI reservation. + * for general non-IOMMU-specific reservations. Currently, this covers GICv3 + * ITS region reservation on ACPI based ARM platforms that may require HW MSI + * reservation. */ void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) { - struct pci_host_bridge *bridge; - struct resource_entry *window; - - if (!is_of_node(dev->iommu_fwspec->iommu_fwnode) && - iort_iommu_msi_get_resv_regions(dev, list) < 0) - return; - - if (!dev_is_pci(dev)) - return; - - bridge = pci_find_host_bridge(to_pci_dev(dev)->bus); - resource_list_for_each_entry(window, >windows) { - struct iommu_resv_region *region; - phys_addr_t start; - size_t length; - - if (resource_type(window->res) != IORESOURCE_MEM) - continue; - start = window->res->start - window->offset; - length = window->res->end - window->res->start + 1; - region = iommu_alloc_resv_region(start, length, 0, - IOMMU_RESV_RESERVED); - if (!region) - return; + if (!is_of_node(dev->iommu_fwspec->iommu_fwnode)) + iort_iommu_msi_get_resv_regions(dev, list); - list_add_tail(>list, list); - } } EXPORT_SYMBOL(iommu_dma_get_resv_regions); @@ -229,6 +205,23 @@ static int cookie_init_hw_msi_region(struct iommu_dma_cookie *cookie, return 0; } +static void iova_reserve_pci_windows(struct pci_dev *dev, + struct iova_domain *iovad) +{ + struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus); + struct resource_entry *window; + unsigned long lo, hi; + + resource_list_for_each_entry(window, >windows) { + if (resource_type(window->res) != IORESOURCE_MEM) + continue; + + lo = iova_pfn(iovad, window->res->start - window->offset); + hi = iova_pfn(iovad, window->res->end - window->offset); + reserve_iova(iovad, lo, hi); + } +} + static int iova_reserve_iommu_regions(struct device *dev, struct iommu_domain *domain) { @@ -238,6 +231,9 @@ static int iova_reserve_iommu_regions(struct device *dev, LIST_HEAD(resv_regions); int ret = 0; + if (dev_is_pci(dev)) + iova_reserve_pci_windows(to_pci_dev(dev), iovad); + iommu_get_resv_regions(dev, _regions); list_for_each_entry(region, _regions, list) { unsigned long lo, hi; ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v5 7/7] iommu/dma: Move PCI window region reservation back into dma specific path.
On Thu, 15 Mar 2018 16:35:09 + Shameer Kolothumwrote: > This pretty much reverts commit 273df9635385 ("iommu/dma: Make PCI > window reservation generic") by moving the PCI window region > reservation back into the dma specific path so that these regions > doesn't get exposed via the IOMMU API interface. With this change, > the vfio interface will report only iommu specific reserved regions > to the user space. > > Cc: Robin Murphy > Cc: Joerg Roedel > Signed-off-by: Shameer Kolothum > --- As currently ordered, we expose the iova list to the user in 5/7 with the PCI window reservations still intact. Isn't that a bisection problem? This patch should come before the iova list is expose to the user. This is otherwise independent, so I can pop it up in the stack on commit, but I'd need an ack from Joerg and Robin to take it via my tree. Thanks, Alex > drivers/iommu/dma-iommu.c | 54 > ++- > 1 file changed, 25 insertions(+), 29 deletions(-) > > diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c > index f05f3cf..ddcbbdb 100644 > --- a/drivers/iommu/dma-iommu.c > +++ b/drivers/iommu/dma-iommu.c > @@ -167,40 +167,16 @@ EXPORT_SYMBOL(iommu_put_dma_cookie); > * @list: Reserved region list from iommu_get_resv_regions() > * > * IOMMU drivers can use this to implement their .get_resv_regions callback > - * for general non-IOMMU-specific reservations. Currently, this covers host > - * bridge windows for PCI devices and GICv3 ITS region reservation on ACPI > - * based ARM platforms that may require HW MSI reservation. > + * for general non-IOMMU-specific reservations. Currently, this covers GICv3 > + * ITS region reservation on ACPI based ARM platforms that may require HW MSI > + * reservation. > */ > void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) > { > - struct pci_host_bridge *bridge; > - struct resource_entry *window; > - > - if (!is_of_node(dev->iommu_fwspec->iommu_fwnode) && > - iort_iommu_msi_get_resv_regions(dev, list) < 0) > - return; > - > - if (!dev_is_pci(dev)) > - return; > - > - bridge = pci_find_host_bridge(to_pci_dev(dev)->bus); > - resource_list_for_each_entry(window, >windows) { > - struct iommu_resv_region *region; > - phys_addr_t start; > - size_t length; > - > - if (resource_type(window->res) != IORESOURCE_MEM) > - continue; > > - start = window->res->start - window->offset; > - length = window->res->end - window->res->start + 1; > - region = iommu_alloc_resv_region(start, length, 0, > - IOMMU_RESV_RESERVED); > - if (!region) > - return; > + if (!is_of_node(dev->iommu_fwspec->iommu_fwnode)) > + iort_iommu_msi_get_resv_regions(dev, list); > > - list_add_tail(>list, list); > - } > } > EXPORT_SYMBOL(iommu_dma_get_resv_regions); > > @@ -229,6 +205,23 @@ static int cookie_init_hw_msi_region(struct > iommu_dma_cookie *cookie, > return 0; > } > > +static void iova_reserve_pci_windows(struct pci_dev *dev, > + struct iova_domain *iovad) > +{ > + struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus); > + struct resource_entry *window; > + unsigned long lo, hi; > + > + resource_list_for_each_entry(window, >windows) { > + if (resource_type(window->res) != IORESOURCE_MEM) > + continue; > + > + lo = iova_pfn(iovad, window->res->start - window->offset); > + hi = iova_pfn(iovad, window->res->end - window->offset); > + reserve_iova(iovad, lo, hi); > + } > +} > + > static int iova_reserve_iommu_regions(struct device *dev, > struct iommu_domain *domain) > { > @@ -238,6 +231,9 @@ static int iova_reserve_iommu_regions(struct device *dev, > LIST_HEAD(resv_regions); > int ret = 0; > > + if (dev_is_pci(dev)) > + iova_reserve_pci_windows(to_pci_dev(dev), iovad); > + > iommu_get_resv_regions(dev, _regions); > list_for_each_entry(region, _regions, list) { > unsigned long lo, hi; ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 08/10] iommu/amd: drop the lock while allocating new irq remap table
The irq_remap_table is allocated while the iommu_table_lock is held with interrupts disabled. >From looking at the call sites, all callers are in the early device initialisation (apic_bsp_setup(), pci_enable_device(), pci_enable_msi()) so make sense to drop the lock which also enables interrupts and try to allocate that memory with GFP_KERNEL instead GFP_ATOMIC. Since during the allocation the iommu_table_lock is dropped, we need to recheck if table exists after the lock has been reacquired. I *think* that it is impossible that the "devid" entry appears in irq_lookup_table while the lock is dropped since the same device can only be probed once. However I check for both cases, just to be sure. Signed-off-by: Sebastian Andrzej Siewior--- drivers/iommu/amd_iommu.c | 65 +-- 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 11ea2d656be8..c493d345b3ef 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3617,6 +3617,30 @@ static struct irq_remap_table *get_irq_table(u16 devid) return table; } +static struct irq_remap_table *__alloc_irq_table(void) +{ + struct irq_remap_table *table; + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return NULL; + + table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL); + if (!table->table) { + kfree(table); + return NULL; + } + raw_spin_lock_init(>lock); + + if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) + memset(table->table, 0, + MAX_IRQS_PER_TABLE * sizeof(u32)); + else + memset(table->table, 0, + (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2))); + return table; +} + static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid, struct irq_remap_table *table) { @@ -3628,6 +3652,7 @@ static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid, static struct irq_remap_table *alloc_irq_table(u16 devid) { struct irq_remap_table *table = NULL; + struct irq_remap_table *new_table = NULL; struct amd_iommu *iommu; unsigned long flags; u16 alias; @@ -3646,42 +3671,44 @@ static struct irq_remap_table *alloc_irq_table(u16 devid) table = irq_lookup_table[alias]; if (table) { set_remap_table_entry(iommu, devid, table); - goto out; + goto out_wait; } + spin_unlock_irqrestore(_table_lock, flags); /* Nothing there yet, allocate new irq remapping table */ - table = kzalloc(sizeof(*table), GFP_ATOMIC); - if (!table) + new_table = __alloc_irq_table(); + if (!new_table) + return NULL; + + spin_lock_irqsave(_table_lock, flags); + + table = irq_lookup_table[devid]; + if (table) goto out_unlock; - /* Initialize table spin-lock */ - raw_spin_lock_init(>lock); - - table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC); - if (!table->table) { - kfree(table); - table = NULL; - goto out_unlock; + table = irq_lookup_table[alias]; + if (table) { + set_remap_table_entry(iommu, devid, table); + goto out_wait; } - if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) - memset(table->table, 0, - MAX_IRQS_PER_TABLE * sizeof(u32)); - else - memset(table->table, 0, - (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2))); - + table = new_table; + new_table = NULL; set_remap_table_entry(iommu, devid, table); if (devid != alias) set_remap_table_entry(iommu, alias, table); -out: +out_wait: iommu_completion_wait(iommu); out_unlock: spin_unlock_irqrestore(_table_lock, flags); + if (new_table) { + kmem_cache_free(amd_iommu_irq_cache, new_table->table); + kfree(new_table); + } return table; } -- 2.16.2 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 05/10] iommu/amd: remove the special case from alloc_irq_table()
alloc_irq_table() has a special ioapic argument. If set then it will pre-allocate / reserve the first 32 indexes. The argument is only once true and it would make alloc_irq_table() a little simpler if we would extract the special bits to the caller. The caller of irq_remapping_alloc() is holding irq_domain_mutex so the initialization of iommu->irte_ops->set_allocated() should not race against other user. Signed-off-by: Sebastian Andrzej Siewior--- drivers/iommu/amd_iommu.c | 34 -- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a87d2fee68db..a5982a61f181 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3617,7 +3617,7 @@ static struct irq_remap_table *get_irq_table(u16 devid) return table; } -static struct irq_remap_table *alloc_irq_table(u16 devid, bool ioapic) +static struct irq_remap_table *alloc_irq_table(u16 devid) { struct irq_remap_table *table = NULL; struct amd_iommu *iommu; @@ -3651,10 +3651,6 @@ static struct irq_remap_table *alloc_irq_table(u16 devid, bool ioapic) /* Initialize table spin-lock */ raw_spin_lock_init(>lock); - if (ioapic) - /* Keep the first 32 indexes free for IOAPIC interrupts */ - table->min_index = 32; - table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC); if (!table->table) { kfree(table); @@ -3669,12 +3665,6 @@ static struct irq_remap_table *alloc_irq_table(u16 devid, bool ioapic) memset(table->table, 0, (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2))); - if (ioapic) { - int i; - - for (i = 0; i < 32; ++i) - iommu->irte_ops->set_allocated(table, i); - } irq_lookup_table[devid] = table; set_dte_irq_entry(devid, table); @@ -3704,7 +3694,7 @@ static int alloc_irq_index(u16 devid, int count, bool align) if (!iommu) return -ENODEV; - table = alloc_irq_table(devid, false); + table = alloc_irq_table(devid); if (!table) return -ENODEV; @@ -4130,10 +4120,26 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, return ret; if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) { - if (alloc_irq_table(devid, true)) + struct irq_remap_table *table; + struct amd_iommu *iommu; + + table = alloc_irq_table(devid); + if (table) { + if (!table->min_index) { + /* +* Keep the first 32 indexes free for IOAPIC +* interrupts. +*/ + table->min_index = 32; + iommu = amd_iommu_rlookup_table[devid]; + for (i = 0; i < 32; ++i) + iommu->irte_ops->set_allocated(table, i); + } + WARN_ON(table->min_index != 32); index = info->ioapic_pin; - else + } else { ret = -ENOMEM; + } } else { bool align = (info->type == X86_IRQ_ALLOC_TYPE_MSI); -- 2.16.2 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 09/10] iommu/amd: make amd_iommu_devtable_lock a spin_lock
Before commit 0bb6e243d7fb ("iommu/amd: Support IOMMU_DOMAIN_DMA type allocation") amd_iommu_devtable_lock had a read_lock() user but now there are none. In fact, after the mentioned commit we had only write_lock() user of the lock. Since there is no reason to keep it as writer lock, change its type to a spin_lock. I *think* that we might even be able to remove the lock because all its current user seem to have their own protection. Signed-off-by: Sebastian Andrzej Siewior--- drivers/iommu/amd_iommu.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index c493d345b3ef..23e26a4b708e 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -80,7 +80,7 @@ */ #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) -static DEFINE_RWLOCK(amd_iommu_devtable_lock); +static DEFINE_SPINLOCK(amd_iommu_devtable_lock); static DEFINE_SPINLOCK(pd_bitmap_lock); static DEFINE_SPINLOCK(iommu_table_lock); @@ -2097,9 +2097,9 @@ static int attach_device(struct device *dev, } skip_ats_check: - write_lock_irqsave(_iommu_devtable_lock, flags); + spin_lock_irqsave(_iommu_devtable_lock, flags); ret = __attach_device(dev_data, domain); - write_unlock_irqrestore(_iommu_devtable_lock, flags); + spin_unlock_irqrestore(_iommu_devtable_lock, flags); /* * We might boot into a crash-kernel here. The crashed kernel @@ -2149,9 +2149,9 @@ static void detach_device(struct device *dev) domain = dev_data->domain; /* lock device table */ - write_lock_irqsave(_iommu_devtable_lock, flags); + spin_lock_irqsave(_iommu_devtable_lock, flags); __detach_device(dev_data); - write_unlock_irqrestore(_iommu_devtable_lock, flags); + spin_unlock_irqrestore(_iommu_devtable_lock, flags); if (!dev_is_pci(dev)) return; @@ -2814,7 +2814,7 @@ static void cleanup_domain(struct protection_domain *domain) struct iommu_dev_data *entry; unsigned long flags; - write_lock_irqsave(_iommu_devtable_lock, flags); + spin_lock_irqsave(_iommu_devtable_lock, flags); while (!list_empty(>dev_list)) { entry = list_first_entry(>dev_list, @@ -2822,7 +2822,7 @@ static void cleanup_domain(struct protection_domain *domain) __detach_device(entry); } - write_unlock_irqrestore(_iommu_devtable_lock, flags); + spin_unlock_irqrestore(_iommu_devtable_lock, flags); } static void protection_domain_free(struct protection_domain *domain) -- 2.16.2 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 07/10] iommu/amd: factor out setting the remap table for a devid
Setting the IRQ remap table for a specific devid (or its alias devid) includes three steps. Those three steps are always repeated each time this is done. Introduce a new helper function, move those steps there and use that function instead. The compiler can still decide if it is worth to inline. Signed-off-by: Sebastian Andrzej Siewior--- drivers/iommu/amd_iommu.c | 23 --- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index ea120c7b46c9..11ea2d656be8 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3617,6 +3617,14 @@ static struct irq_remap_table *get_irq_table(u16 devid) return table; } +static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid, + struct irq_remap_table *table) +{ + irq_lookup_table[devid] = table; + set_dte_irq_entry(devid, table); + iommu_flush_dte(iommu, devid); +} + static struct irq_remap_table *alloc_irq_table(u16 devid) { struct irq_remap_table *table = NULL; @@ -3637,9 +3645,7 @@ static struct irq_remap_table *alloc_irq_table(u16 devid) alias = amd_iommu_alias_table[devid]; table = irq_lookup_table[alias]; if (table) { - irq_lookup_table[devid] = table; - set_dte_irq_entry(devid, table); - iommu_flush_dte(iommu, devid); + set_remap_table_entry(iommu, devid, table); goto out; } @@ -3666,14 +3672,9 @@ static struct irq_remap_table *alloc_irq_table(u16 devid) (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2))); - irq_lookup_table[devid] = table; - set_dte_irq_entry(devid, table); - iommu_flush_dte(iommu, devid); - if (devid != alias) { - irq_lookup_table[alias] = table; - set_dte_irq_entry(alias, table); - iommu_flush_dte(iommu, alias); - } + set_remap_table_entry(iommu, devid, table); + if (devid != alias) + set_remap_table_entry(iommu, alias, table); out: iommu_completion_wait(iommu); -- 2.16.2 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 10/10] iommu/amd: return proper error code in irq_remapping_alloc()
In the unlikely case when alloc_irq_table() is not able to return a remap table then "ret" will be assigned with an error code. Later, the code checks `index' and if it is negative (which it is because it is initialized with `-1') and then then function properly aborts but returns `-1' instead `-ENOMEM' what was intended. In order to correct this, I assign -ENOMEM to index. Signed-off-by: Sebastian Andrzej Siewior--- drivers/iommu/amd_iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 23e26a4b708e..6107e24bed8a 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -4124,7 +4124,7 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, struct amd_ir_data *data = NULL; struct irq_cfg *cfg; int i, ret, devid; - int index = -1; + int index; if (!info) return -EINVAL; @@ -4166,7 +4166,7 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, WARN_ON(table->min_index != 32); index = info->ioapic_pin; } else { - ret = -ENOMEM; + index = -ENOMEM; } } else { bool align = (info->type == X86_IRQ_ALLOC_TYPE_MSI); -- 2.16.2 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 06/10] iommu/amd: use `table' instead `irt' as variable name in amd_iommu_update_ga()
The variable of type struct irq_remap_table is always named `table' except in amd_iommu_update_ga() where it is called `irt'. Make it consistent and name it also `table'. Signed-off-by: Sebastian Andrzej Siewior--- drivers/iommu/amd_iommu.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a5982a61f181..ea120c7b46c9 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -4405,7 +4405,7 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data) { unsigned long flags; struct amd_iommu *iommu; - struct irq_remap_table *irt; + struct irq_remap_table *table; struct amd_ir_data *ir_data = (struct amd_ir_data *)data; int devid = ir_data->irq_2_irte.devid; struct irte_ga *entry = (struct irte_ga *) ir_data->entry; @@ -4419,11 +4419,11 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data) if (!iommu) return -ENODEV; - irt = get_irq_table(devid); - if (!irt) + table = get_irq_table(devid); + if (!table) return -ENODEV; - raw_spin_lock_irqsave(>lock, flags); + raw_spin_lock_irqsave(>lock, flags); if (ref->lo.fields_vapic.guest_mode) { if (cpu >= 0) @@ -4432,7 +4432,7 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data) barrier(); } - raw_spin_unlock_irqrestore(>lock, flags); + raw_spin_unlock_irqrestore(>lock, flags); iommu_flush_irt(iommu, devid); iommu_completion_wait(iommu); -- 2.16.2 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 04/10] iommu/amd: split irq_lookup_table out of the amd_iommu_devtable_lock
The function get_irq_table() reads/writes irq_lookup_table while holding the amd_iommu_devtable_lock. It also modifies amd_iommu_dev_table[].data[2]. set_dte_entry() is using amd_iommu_dev_table[].data[0|1] (under the domain->lock) so it should be okay. The access to the iommu is serialized with its own (iommu's) lock. So split out get_irq_table() out of amd_iommu_devtable_lock's lock. Signed-off-by: Sebastian Andrzej Siewior--- drivers/iommu/amd_iommu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index fcfdce70707d..a87d2fee68db 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -82,6 +82,7 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); static DEFINE_SPINLOCK(pd_bitmap_lock); +static DEFINE_SPINLOCK(iommu_table_lock); /* List of all available dev_data structures */ static LLIST_HEAD(dev_data_list); @@ -3623,7 +3624,7 @@ static struct irq_remap_table *alloc_irq_table(u16 devid, bool ioapic) unsigned long flags; u16 alias; - write_lock_irqsave(_iommu_devtable_lock, flags); + spin_lock_irqsave(_table_lock, flags); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) @@ -3688,7 +3689,7 @@ static struct irq_remap_table *alloc_irq_table(u16 devid, bool ioapic) iommu_completion_wait(iommu); out_unlock: - write_unlock_irqrestore(_iommu_devtable_lock, flags); + spin_unlock_irqrestore(_table_lock, flags); return table; } -- 2.16.2 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 03/10] iommu/amd: split domain id out of amd_iommu_devtable_lock
domain_id_alloc() and domain_id_free() is used for id management. Those two function share a bitmap (amd_iommu_pd_alloc_bitmap) and set/clear bits based on id allocation. There is no need to share this with amd_iommu_devtable_lock, it can use its own lock for this operation. Signed-off-by: Sebastian Andrzej Siewior--- drivers/iommu/amd_iommu.c | 12 +--- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index d4c2b1a11924..fcfdce70707d 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -81,6 +81,7 @@ #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) static DEFINE_RWLOCK(amd_iommu_devtable_lock); +static DEFINE_SPINLOCK(pd_bitmap_lock); /* List of all available dev_data structures */ static LLIST_HEAD(dev_data_list); @@ -1600,29 +1601,26 @@ static void del_domain_from_list(struct protection_domain *domain) static u16 domain_id_alloc(void) { - unsigned long flags; int id; - write_lock_irqsave(_iommu_devtable_lock, flags); + spin_lock(_bitmap_lock); id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); BUG_ON(id == 0); if (id > 0 && id < MAX_DOMAIN_ID) __set_bit(id, amd_iommu_pd_alloc_bitmap); else id = 0; - write_unlock_irqrestore(_iommu_devtable_lock, flags); + spin_unlock(_bitmap_lock); return id; } static void domain_id_free(int id) { - unsigned long flags; - - write_lock_irqsave(_iommu_devtable_lock, flags); + spin_lock(_bitmap_lock); if (id > 0 && id < MAX_DOMAIN_ID) __clear_bit(id, amd_iommu_pd_alloc_bitmap); - write_unlock_irqrestore(_iommu_devtable_lock, flags); + spin_unlock(_bitmap_lock); } #define DEFINE_FREE_PT_FN(LVL, FN) \ -- 2.16.2 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 02/10] iommu/amd: turn dev_data_list into a lock less list
alloc_dev_data() adds new items to dev_data_list and search_dev_data() is searching for items in this list. Both protect the access to the list with a spinlock. There is no need to navigate forth and back within the list and there is also no deleting of a specific item. This qualifies the list to become a lock less list and as part of this, the spinlock can be removed. With this change the ordering of those items within the list is changed: before the change new items were added to the end of the list, now they are added to the front. I don't think it matters but wanted to mention it. Signed-off-by: Sebastian Andrzej Siewior--- drivers/iommu/amd_iommu.c | 28 ++-- drivers/iommu/amd_iommu_types.h | 2 +- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 121c54f1c768..d4c2b1a11924 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -83,8 +83,7 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); /* List of all available dev_data structures */ -static LIST_HEAD(dev_data_list); -static DEFINE_SPINLOCK(dev_data_list_lock); +static LLIST_HEAD(dev_data_list); LIST_HEAD(ioapic_map); LIST_HEAD(hpet_map); @@ -203,40 +202,33 @@ static struct dma_ops_domain* to_dma_ops_domain(struct protection_domain *domain static struct iommu_dev_data *alloc_dev_data(u16 devid) { struct iommu_dev_data *dev_data; - unsigned long flags; dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); if (!dev_data) return NULL; dev_data->devid = devid; - - spin_lock_irqsave(_data_list_lock, flags); - list_add_tail(_data->dev_data_list, _data_list); - spin_unlock_irqrestore(_data_list_lock, flags); - ratelimit_default_init(_data->rs); + llist_add(_data->dev_data_list, _data_list); return dev_data; } static struct iommu_dev_data *search_dev_data(u16 devid) { struct iommu_dev_data *dev_data; - unsigned long flags; + struct llist_node *node; - spin_lock_irqsave(_data_list_lock, flags); - list_for_each_entry(dev_data, _data_list, dev_data_list) { + if (llist_empty(_data_list)) + return NULL; + + node = dev_data_list.first; + llist_for_each_entry(dev_data, node, dev_data_list) { if (dev_data->devid == devid) - goto out_unlock; + return dev_data; } - dev_data = NULL; - -out_unlock: - spin_unlock_irqrestore(_data_list_lock, flags); - - return dev_data; + return NULL; } static int __last_alias(struct pci_dev *pdev, u16 alias, void *data) diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index da886b0095aa..1c9b080276c9 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -627,7 +627,7 @@ struct devid_map { */ struct iommu_dev_data { struct list_head list;/* For domain->dev_list */ - struct list_head dev_data_list; /* For global dev_data_list */ + struct llist_node dev_data_list; /* For global dev_data_list */ struct protection_domain *domain; /* Domain the device is bound to */ u16 devid;/* PCI Device ID */ u16 alias;/* Alias Device ID */ -- 2.16.2 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 01/10] iommu/amd: take into account that alloc_dev_data() may return NULL
find_dev_data() does not check whether the return value alloc_dev_data() is NULL. This was okay once because the pointer was returned once as-is. Since commit df3f7a6e8e85 ("iommu/amd: Use is_attach_deferred call-back") the pointer may be used within find_dev_data() so a NULL check is required. Cc: Baoquan HeFixes: df3f7a6e8e85 ("iommu/amd: Use is_attach_deferred call-back") Signed-off-by: Sebastian Andrzej Siewior --- drivers/iommu/amd_iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 4cd19f93ca15..121c54f1c768 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -310,6 +310,8 @@ static struct iommu_dev_data *find_dev_data(u16 devid) if (dev_data == NULL) { dev_data = alloc_dev_data(devid); + if (!dev_data) + return NULL; if (translation_pre_enabled(iommu)) dev_data->defer_attach = true; -- 2.16.2 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH 00/10 v3] iommu/amd: lock splitting & GFP_KERNEL allocation
Hi, this is the rebase on top of iommu/x86/amd of my last series. It takes Scotts comments into consideration from my v2. It contains lock splitting and GFP_KERNEL allocation of remap-table. Mostly cleanup. The patches were boot tested on an AMD EPYC 7601. Sebastian ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
RE: [PATCH v2 2/2] drivers: remove force dma flag from buses
> -Original Message- > From: Christoph Hellwig [mailto:h...@lst.de] > Sent: Thursday, March 22, 2018 13:49 > To: Nipun Gupta> > > --- a/drivers/dma/qcom/hidma_mgmt.c > > +++ b/drivers/dma/qcom/hidma_mgmt.c > > @@ -398,7 +398,7 @@ static int __init > hidma_mgmt_of_populate_channels(struct device_node *np) > > } > > of_node_get(child); > > new_pdev->dev.of_node = child; > > - of_dma_configure(_pdev->dev, child); > > + of_dma_configure(_pdev->dev, child, true); > > /* > > * It is assumed that calling of_msi_configure is safe on > > * platforms with or without MSI support. > > Where did we mark this bus as force_dma before? I thought these devices to be on the platform bus as the device is of type 'struct platform_device', though I am not sure then why 'of_dma_configure()' is called here. Is this not on platform bus? > > > diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c > > index 9a4f4246..895c83e 100644 > > --- a/drivers/of/of_reserved_mem.c > > +++ b/drivers/of/of_reserved_mem.c > > @@ -353,7 +353,7 @@ int of_reserved_mem_device_init_by_idx(struct device > *dev, > > /* ensure that dma_ops is set for virtual devices > > * using reserved memory > > */ > > - of_dma_configure(dev, np); > > + of_dma_configure(dev, np, true); > > Did all the callers of this one really force dma? I have a hard time > untangling the call stacks unfortunately. I see this API being called indirectly from NXP DPAA device driver which is for platform bus devices. So I marked 'true' out here. There are more places from where it is being called. Thanks, Nipun ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
RE: [PATCH v2 1/2] dma-mapping: move dma configuration to bus infrastructure
> -Original Message- > From: Christoph Hellwig [mailto:h...@lst.de] > Sent: Thursday, March 22, 2018 13:46 > To: Nipun Gupta> > > +static int amba_dma_configure(struct device *dev) > > +{ > > + return dma_common_configure(dev); > > +} > > So it turns out we only end with two callers of dma_common_configure > after this series. Based ont hat I'm tempted with the suggestion > from Robin to just have amba call platform_dma_configure, and move > the code from dma_common_configure to platform_dma_configure. okay, that would be fine, trivial query - will it be okay to include 'linux/platform_device.h' in the AMBA bus? I am reluctant for this change because of including platform file. Thanks, Nipun ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH] iommu/vt-d, trivial: Remove unused variable
Unused after commit <42e8c186b595> ("iommu/vt-d: Simplify io/tlb flushing in intel_iommu_unmap"), cleanup it. Cc: David WoodhouseCc: Joerg Roedel Signed-off-by: Shaokun Zhang --- drivers/iommu/intel-iommu.c | 6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 582fd01..d49e0d3 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -5072,7 +5072,6 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, { struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct page *freelist = NULL; - struct intel_iommu *iommu; unsigned long start_pfn, last_pfn; unsigned int npages; int iommu_id, level = 0; @@ -5091,12 +5090,9 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, npages = last_pfn - start_pfn + 1; - for_each_domain_iommu(iommu_id, dmar_domain) { - iommu = g_iommus[iommu_id]; - + for_each_domain_iommu(iommu_id, dmar_domain) iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain, start_pfn, npages, !freelist, 0); - } dma_free_pagelist(freelist); -- 1.9.1 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v11 0/4] iommu/arm-smmu: Add runtime pm/sleep support
This series provides the support for turning on the arm-smmu's clocks/power domains using runtime pm. This is done using the recently introduced device links patches, which lets the smmu's runtime to follow the master's runtime pm, so the smmu remains powered only when the masters use it. As not all implementations support clock/power gating, we are checking for a valid 'smmu->dev's pm_domain' to conditionally enable the runtime power management for such smmu implementations that can support it. This series also adds support for Qcom's arm-smmu-v2 variant that has different clocks and power requirements. Took some reference from the exynos runtime patches [1]. With conditional runtime pm now, we avoid touching dev->power.lock in fastpaths for smmu implementations that don't need to do anything useful with pm_runtime. This lets us to use the much-argued pm_runtime_get_sync/put_sync() calls in map/unmap callbacks so that the clients do not have to worry about handling any of the arm-smmu's power. Previous version of this patch series is @ [5]. [v11] * Some more cleanups for device link. We don't need an explicit delete for device link from the driver, but just set the flag DL_FLAG_AUTOREMOVE. device_link_add() API description says - "If the DL_FLAG_AUTOREMOVE is set, the link will be removed automatically when the consumer device driver unbinds." * Addressed the comments for 'smmu' in arm_smmu_map/unmap(). * Dropped the patch [10] that introduced device_link_del_dev() API. [v10] * Introduce device_link_del_dev() API to delete the link between given consumer and supplier devices. The users of device link do not need to store link pointer to delete the link later. They can straightaway use this API by passing consumer and supplier devices. * Made corresponding changes to arm-smmu driver patch handling the device links. * Dropped the patch [9] that was adding device_link_find() API to device core layer. device_link_del_dev() serves the purpose to directly delete the link between two given devices. [v9] * Removed 'rpm_supported' flag, instead checking on pm_domain to enable runtime pm. * Creating device link only when the runtime pm is enabled, as we don't need a device link besides managing the power dependency between supplier and consumer devices. * Introducing a patch to add device_link_find() API that finds and existing link between supplier and consumer devices. Also, made necessary change to device_link_add() to use this API. * arm_smmu_remove_device() now uses this device_link_find() to find the device link between smmu device and the master device, and then delete this link. * Dropped the destroy_domain_context() fix [8] as it was rather, introducing catastrophically bad problem by destroying 'good dev's domain context. * Added 'Reviwed-by' tag for Tomasz's review. [v8] * Major change - - Added a flag 'rpm_supported' which each platform that supports runtime pm, can enable, and we enable runtime_pm over arm-smmu only when this flag is set. - Adding the conditional pm_runtime_get/put() calls to .map, .unmap and .attach_dev ops. - Dropped the patch [6] that exported pm_runtim_get/put_suupliers(), and also dropped the user driver patch [7] for these APIs. * Clock code further cleanup - doing only clk_bulk_enable() and clk_bulk_disable() in runtime pm callbacks. We shouldn't be taking a slow path (clk_prepare/unprepare()) from these runtime pm callbacks. Thereby, moved clk_bulk_prepare() to arm_smmu_device_probe(), and clk_bulk_unprepare() to arm_smmu_device_remove(). - clk data filling to a common method arm_smmu_fill_clk_data() that fills the clock ids and number of clocks. * Addressed other nits and comments - device_link_add() error path fixed. - Fix for checking negative error value from pm_runtime_get_sync(). - Documentation redo. * Added another patch fixing the error path in arm_smmu_attach_dev() to destroy allocated domain context. [v7] * Addressed review comments given by Robin Murphy - - Added device_link_del() in .remove_device path. - Error path cleanup in arm_smmu_add_device(). - Added pm_runtime_get/put_sync() in .remove path, and replaced pm_runtime_force_suspend() with pm_runtime_disable(). - clk_names cleanup in arm_smmu_init_clks() * Added 'Reviewed-by' given by Rob H. [V6] * Added Ack given by Rafael to first patch in the series. * Addressed Rob Herring's comment for adding soc specific compatible string as well besides 'qcom,smmu-v2'. [V5] * Dropped runtime pm calls from "arm_smmu_unmap" op as discussed over the list [3] for the last patch series. * Added a patch to export pm_runtime_get/put_suppliers() APIs to the series as agreed with Rafael [4]. *
[PATCH v11 1/4] iommu/arm-smmu: Add pm_runtime/sleep ops
From: Sricharan RThe smmu needs to be functional only when the respective master's using it are active. The device_link feature helps to track such functional dependencies, so that the iommu gets powered when the master device enables itself using pm_runtime. So by adapting the smmu driver for runtime pm, above said dependency can be addressed. This patch adds the pm runtime/sleep callbacks to the driver and also the functions to parse the smmu clocks from DT and enable them in resume/suspend. Signed-off-by: Sricharan R Signed-off-by: Archit Taneja [vivek: Clock rework to request bulk of clocks] Signed-off-by: Vivek Gautam Reviewed-by: Tomasz Figa --- drivers/iommu/arm-smmu.c | 60 ++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 69e7c60792a8..d5873d545024 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -205,6 +206,8 @@ struct arm_smmu_device { u32 num_global_irqs; u32 num_context_irqs; unsigned int*irqs; + struct clk_bulk_data*clks; + int num_clks; u32 cavium_id_base; /* Specific to Cavium */ @@ -1897,10 +1900,12 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu) struct arm_smmu_match_data { enum arm_smmu_arch_version version; enum arm_smmu_implementation model; + const char * const *clks; + int num_clks; }; #define ARM_SMMU_MATCH_DATA(name, ver, imp)\ -static struct arm_smmu_match_data name = { .version = ver, .model = imp } +static const struct arm_smmu_match_data name = { .version = ver, .model = imp } ARM_SMMU_MATCH_DATA(smmu_generic_v1, ARM_SMMU_V1, GENERIC_SMMU); ARM_SMMU_MATCH_DATA(smmu_generic_v2, ARM_SMMU_V2, GENERIC_SMMU); @@ -1919,6 +1924,23 @@ static const struct of_device_id arm_smmu_of_match[] = { }; MODULE_DEVICE_TABLE(of, arm_smmu_of_match); +static void arm_smmu_fill_clk_data(struct arm_smmu_device *smmu, + const char * const *clks) +{ + int i; + + if (smmu->num_clks < 1) + return; + + smmu->clks = devm_kcalloc(smmu->dev, smmu->num_clks, + sizeof(*smmu->clks), GFP_KERNEL); + if (!smmu->clks) + return; + + for (i = 0; i < smmu->num_clks; i++) + smmu->clks[i].id = clks[i]; +} + #ifdef CONFIG_ACPI static int acpi_smmu_get_data(u32 model, struct arm_smmu_device *smmu) { @@ -2001,6 +2023,9 @@ static int arm_smmu_device_dt_probe(struct platform_device *pdev, data = of_device_get_match_data(dev); smmu->version = data->version; smmu->model = data->model; + smmu->num_clks = data->num_clks; + + arm_smmu_fill_clk_data(smmu, data->clks); parse_driver_options(smmu); @@ -2099,6 +2124,14 @@ static int arm_smmu_device_probe(struct platform_device *pdev) smmu->irqs[i] = irq; } + err = devm_clk_bulk_get(smmu->dev, smmu->num_clks, smmu->clks); + if (err) + return err; + + err = clk_bulk_prepare(smmu->num_clks, smmu->clks); + if (err) + return err; + err = arm_smmu_device_cfg_probe(smmu); if (err) return err; @@ -2181,6 +2214,9 @@ static int arm_smmu_device_remove(struct platform_device *pdev) /* Turn the thing off */ writel(sCR0_CLIENTPD, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0); + + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + return 0; } @@ -2197,7 +2233,27 @@ static int __maybe_unused arm_smmu_pm_resume(struct device *dev) return 0; } -static SIMPLE_DEV_PM_OPS(arm_smmu_pm_ops, NULL, arm_smmu_pm_resume); +static int __maybe_unused arm_smmu_runtime_resume(struct device *dev) +{ + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + + return clk_bulk_enable(smmu->num_clks, smmu->clks); +} + +static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev) +{ + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + + clk_bulk_disable(smmu->num_clks, smmu->clks); + + return 0; +} + +static const struct dev_pm_ops arm_smmu_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(NULL, arm_smmu_pm_resume) + SET_RUNTIME_PM_OPS(arm_smmu_runtime_suspend, + arm_smmu_runtime_resume, NULL) +}; static struct platform_driver arm_smmu_driver = { .driver = { -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation
[PATCH v11 3/4] iommu/arm-smmu: Add the device_link between masters and smmu
From: Sricharan RFinally add the device link between the master device and smmu, so that the smmu gets runtime enabled/disabled only when the master needs it. This is done from add_device callback which gets called once when the master is added to the smmu. Signed-off-by: Sricharan R Signed-off-by: Vivek Gautam Reviewed-by: Tomasz Figa --- drivers/iommu/arm-smmu.c | 12 1 file changed, 12 insertions(+) diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index 3383c6657857..da50fd979d5f 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -1461,8 +1461,20 @@ static int arm_smmu_add_device(struct device *dev) iommu_device_link(>iommu, dev); + if (pm_runtime_enabled(smmu->dev) && + !device_link_add(dev, smmu->dev, +DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE)) { + dev_err(smmu->dev, "Unable to add link to the consumer %s\n", + dev_name(dev)); + ret = -ENODEV; + goto out_unlink; + } + return 0; +out_unlink: + iommu_device_unlink(>iommu, dev); + arm_smmu_master_free_smes(fwspec); out_cfg_free: kfree(cfg); out_free: -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v11 4/4] iommu/arm-smmu: Add support for qcom,smmu-v2 variant
qcom,smmu-v2 is an arm,smmu-v2 implementation with specific clock and power requirements. This smmu core is used with multiple masters on msm8996, viz. mdss, video, etc. Add bindings for the same. Signed-off-by: Vivek GautamReviewed-by: Rob Herring Reviewed-by: Tomasz Figa --- .../devicetree/bindings/iommu/arm,smmu.txt | 42 ++ drivers/iommu/arm-smmu.c | 14 2 files changed, 56 insertions(+) diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.txt b/Documentation/devicetree/bindings/iommu/arm,smmu.txt index 8a6ffce12af5..7c71a6ed465a 100644 --- a/Documentation/devicetree/bindings/iommu/arm,smmu.txt +++ b/Documentation/devicetree/bindings/iommu/arm,smmu.txt @@ -17,10 +17,19 @@ conditions. "arm,mmu-401" "arm,mmu-500" "cavium,smmu-v2" +"qcom,-smmu-v2", "qcom,smmu-v2" depending on the particular implementation and/or the version of the architecture implemented. + A number of Qcom SoCs use qcom,smmu-v2 version of the IP. + "qcom,-smmu-v2" represents a soc specific compatible + string that should be present along with the "qcom,smmu-v2" + to facilitate SoC specific clocks/power connections and to + address specific bug fixes. + An example string would be - + "qcom,msm8996-smmu-v2", "qcom,smmu-v2". + - reg : Base address and size of the SMMU. - #global-interrupts : The number of global interrupts exposed by the @@ -71,6 +80,22 @@ conditions. or using stream matching with #iommu-cells = <2>, and may be ignored if present in such cases. +- clock-names:List of the names of clocks input to the device. The + required list depends on particular implementation and + is as follows: + - for "qcom,smmu-v2": +- "bus": clock required for downstream bus access and + for the smmu ptw, +- "iface": clock required to access smmu's registers + through the TCU's programming interface. + - unspecified for other implementations. + +- clocks: Specifiers for all clocks listed in the clock-names property, + as per generic clock bindings. + +- power-domains: Specifiers for power domains required to be powered on for + the SMMU to operate, as per generic power domain bindings. + ** Deprecated properties: - mmu-masters (deprecated in favour of the generic "iommus" binding) : @@ -137,3 +162,20 @@ conditions. iommu-map = <0 0 0x400>; ... }; + + /* Qcom's arm,smmu-v2 implementation */ + smmu4: iommu { + compatible = "qcom,msm8996-smmu-v2", "qcom,smmu-v2"; + reg = <0xd0 0x1>; + + #global-interrupts = <1>; + interrupts = , +, +; + #iommu-cells = <1>; + power-domains = < MDSS_GDSC>; + + clocks = < SMMU_MDP_AXI_CLK>, +< SMMU_MDP_AHB_CLK>; + clock-names = "bus", "iface"; + }; diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index da50fd979d5f..ca3559ea115b 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -119,6 +119,7 @@ enum arm_smmu_implementation { GENERIC_SMMU, ARM_MMU500, CAVIUM_SMMUV2, + QCOM_SMMUV2, }; struct arm_smmu_s2cr { @@ -1980,6 +1981,17 @@ ARM_SMMU_MATCH_DATA(arm_mmu401, ARM_SMMU_V1_64K, GENERIC_SMMU); ARM_SMMU_MATCH_DATA(arm_mmu500, ARM_SMMU_V2, ARM_MMU500); ARM_SMMU_MATCH_DATA(cavium_smmuv2, ARM_SMMU_V2, CAVIUM_SMMUV2); +static const char * const qcom_smmuv2_clks[] = { + "bus", "iface", +}; + +static const struct arm_smmu_match_data qcom_smmuv2 = { + .version = ARM_SMMU_V2, + .model = QCOM_SMMUV2, + .clks = qcom_smmuv2_clks, + .num_clks = ARRAY_SIZE(qcom_smmuv2_clks), +}; + static const struct of_device_id arm_smmu_of_match[] = { { .compatible = "arm,smmu-v1", .data = _generic_v1 }, { .compatible = "arm,smmu-v2", .data = _generic_v2 }, @@ -1987,6 +1999,7 @@ static const struct of_device_id arm_smmu_of_match[] = { { .compatible = "arm,mmu-401", .data = _mmu401 }, { .compatible = "arm,mmu-500", .data = _mmu500 }, { .compatible = "cavium,smmu-v2", .data = _smmuv2 }, + { .compatible = "qcom,smmu-v2", .data = _smmuv2 }, { }, }; MODULE_DEVICE_TABLE(of, arm_smmu_of_match); @@ -2361,6 +2374,7 @@ IOMMU_OF_DECLARE(arm_mmu400, "arm,mmu-400");
[PATCH v11 2/4] iommu/arm-smmu: Invoke pm_runtime during probe, add/remove device
From: Sricharan RThe smmu device probe/remove and add/remove master device callbacks gets called when the smmu is not linked to its master, that is without the context of the master device. So calling runtime apis in those places separately. Signed-off-by: Sricharan R [vivek: Cleanup pm runtime calls] Signed-off-by: Vivek Gautam Reviewed-by: Tomasz Figa --- drivers/iommu/arm-smmu.c | 92 +++- 1 file changed, 84 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index d5873d545024..3383c6657857 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -268,6 +268,20 @@ static struct arm_smmu_option_prop arm_smmu_options[] = { { 0, NULL}, }; +static inline int arm_smmu_rpm_get(struct arm_smmu_device *smmu) +{ + if (pm_runtime_enabled(smmu->dev)) + return pm_runtime_get_sync(smmu->dev); + + return 0; +} + +static inline void arm_smmu_rpm_put(struct arm_smmu_device *smmu) +{ + if (pm_runtime_enabled(smmu->dev)) + pm_runtime_put(smmu->dev); +} + static struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) { return container_of(dom, struct arm_smmu_domain, domain); @@ -913,11 +927,15 @@ static void arm_smmu_destroy_domain_context(struct iommu_domain *domain) struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_cfg *cfg = _domain->cfg; - int irq; + int ret, irq; if (!smmu || domain->type == IOMMU_DOMAIN_IDENTITY) return; + ret = arm_smmu_rpm_get(smmu); + if (ret < 0) + return; + /* * Disable the context bank and free the page tables before freeing * it. @@ -932,6 +950,8 @@ static void arm_smmu_destroy_domain_context(struct iommu_domain *domain) free_io_pgtable_ops(smmu_domain->pgtbl_ops); __arm_smmu_free_bitmap(smmu->context_map, cfg->cbndx); + + arm_smmu_rpm_put(smmu); } static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) @@ -1213,10 +1233,15 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) return -ENODEV; smmu = fwspec_smmu(fwspec); + + ret = arm_smmu_rpm_get(smmu); + if (ret < 0) + return ret; + /* Ensure that the domain is finalised */ ret = arm_smmu_init_domain_context(domain, smmu); if (ret < 0) - return ret; + goto rpm_put; /* * Sanity check the domain. We don't support domains across @@ -1226,33 +1251,50 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) dev_err(dev, "cannot attach to SMMU %s whilst already attached to domain on SMMU %s\n", dev_name(smmu_domain->smmu->dev), dev_name(smmu->dev)); - return -EINVAL; + ret = -EINVAL; + goto rpm_put; } /* Looks ok, so add the device to the domain */ - return arm_smmu_domain_add_master(smmu_domain, fwspec); + ret = arm_smmu_domain_add_master(smmu_domain, fwspec); + +rpm_put: + arm_smmu_rpm_put(smmu); + return ret; } static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot) { struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; + struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu; + int ret; if (!ops) return -ENODEV; - return ops->map(ops, iova, paddr, size, prot); + arm_smmu_rpm_get(smmu); + ret = ops->map(ops, iova, paddr, size, prot); + arm_smmu_rpm_put(smmu); + + return ret; } static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; + struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu; + size_t ret; if (!ops) return 0; - return ops->unmap(ops, iova, size); + arm_smmu_rpm_get(smmu); + ret = ops->unmap(ops, iova, size); + arm_smmu_rpm_put(smmu); + + return ret; } static void arm_smmu_iotlb_sync(struct iommu_domain *domain) @@ -1407,7 +1449,13 @@ static int arm_smmu_add_device(struct device *dev) while (i--) cfg->smendx[i] = INVALID_SMENDX; + ret = arm_smmu_rpm_get(smmu); + if (ret < 0) + goto out_cfg_free; + ret = arm_smmu_master_alloc_smes(dev); + arm_smmu_rpm_put(smmu); + if (ret) goto out_cfg_free; @@ -1427,7 +1475,7
RE: [PATCH 1/4] iommu: Add virtio-iommu driver
> From: Robin Murphy [mailto:robin.mur...@arm.com] > Sent: Wednesday, March 21, 2018 10:24 PM > > On 21/03/18 13:14, Jean-Philippe Brucker wrote: > > On 21/03/18 06:43, Tian, Kevin wrote: > > [...] > >>> + > >>> +#include > >>> + > >>> +#define MSI_IOVA_BASE0x800 > >>> +#define MSI_IOVA_LENGTH 0x10 > >> > >> this is ARM specific, and according to virtio-iommu spec isn't it > >> better probed on the endpoint instead of hard-coding here? > > > > These values are arbitrary, not really ARM-specific even if ARM is the > > only user yet: we're just reserving a random IOVA region for mapping > MSIs. > > It is hard-coded because of the way iommu-dma.c works, but I don't > quite > > remember why that allocation isn't dynamic. > > The host kernel needs to have *some* MSI region in place before the > guest can start configuring interrupts, otherwise it won't know what > address to give to the underlying hardware. However, as soon as the host > kernel has picked a region, host userspace needs to know that it can no > longer use addresses in that region for DMA-able guest memory. It's a > lot easier when the address is fixed in hardware and the host userspace > will never be stupid enough to try and VFIO_IOMMU_DMA_MAP it, but in > the > more general case where MSI writes undergo IOMMU address translation > so > it's an arbitrary IOVA, this has the potential to conflict with stuff > like guest memory hotplug. > > What we currently have is just the simplest option, with the host kernel > just picking something up-front and pretending to host userspace that > it's a fixed hardware address. There's certainly scope for it to be a > bit more dynamic in the sense of adding an interface to let userspace > move it around (before attaching any devices, at least), but I don't > think it's feasible for the host kernel to second-guess userspace enough > to make it entirely transparent like it is in the DMA API domain case. > > Of course, that's all assuming the host itself is using a virtio-iommu > (e.g. in a nested virt or emulation scenario). When it's purely within a > guest then an MSI reservation shouldn't matter so much, since the guest > won't be anywhere near the real hardware configuration anyway. > > Robin. Curious since anyway we are defining a new iommu architecture is it possible to avoid those ARM-specific burden completely? Thanks Kevin ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
RE: [PATCH v5 2/7] vfio/type1: Check reserve region conflict and update iova list
> -Original Message- > From: Alex Williamson [mailto:alex.william...@redhat.com] > Sent: Wednesday, March 21, 2018 4:31 PM > To: Tian, Kevin> Cc: Shameerali Kolothum Thodi ; > eric.au...@redhat.com; pmo...@linux.vnet.ibm.com; k...@vger.kernel.org; > linux-ker...@vger.kernel.org; xuwei (O) ; Linuxarm > ; iommu@lists.linux-foundation.org > Subject: Re: [PATCH v5 2/7] vfio/type1: Check reserve region conflict and > update iova list > > On Wed, 21 Mar 2018 03:30:29 + > "Tian, Kevin" wrote: > > > > From: Alex Williamson [mailto:alex.william...@redhat.com] > > > Sent: Wednesday, March 21, 2018 6:38 AM > > > > > > On Mon, 19 Mar 2018 07:51:58 + > > > "Tian, Kevin" wrote: > > > > > > > > From: Shameer Kolothum > > > > > Sent: Friday, March 16, 2018 12:35 AM > > > > > > > > > > This retrieves the reserved regions associated with dev group and > > > > > checks for conflicts with any existing dma mappings. Also update > > > > > the iova list excluding the reserved regions. > > > > > > > > > > Signed-off-by: Shameer Kolothum > > > > > > > > > > --- > > > > > drivers/vfio/vfio_iommu_type1.c | 90 > > > > > + > > > > > 1 file changed, 90 insertions(+) > > > > > > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c > > > > > b/drivers/vfio/vfio_iommu_type1.c > > > > > index 1123c74..cfe2bb2 100644 > > > > > --- a/drivers/vfio/vfio_iommu_type1.c > > > > > +++ b/drivers/vfio/vfio_iommu_type1.c > > > > > @@ -1313,6 +1313,82 @@ static int vfio_iommu_aper_resize(struct > > > > > list_head *iova, > > > > > return 0; > > > > > } > > > > > > > > > > +/* > > > > > + * Check reserved region conflicts with existing dma mappings > > > > > + */ > > > > > +static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu, > > > > > + struct list_head *resv_regions) > > > > > +{ > > > > > + struct iommu_resv_region *region; > > > > > + > > > > > + /* Check for conflict with existing dma mappings */ > > > > > + list_for_each_entry(region, resv_regions, list) { > > > > > + if (vfio_find_dma(iommu, region->start, region- > >length)) > > > > > + return true; > > > > > + } > > > > > + > > > > > + return false; > > > > > +} > > > > > + > > > > > +/* > > > > > + * Check iova region overlap with reserved regions and > > > > > + * exclude them from the iommu iova range > > > > > + */ > > > > > +static int vfio_iommu_resv_exclude(struct list_head *iova, > > > > > + struct list_head *resv_regions) > > > > > +{ > > > > > + struct iommu_resv_region *resv; > > > > > + struct vfio_iova *n, *next; > > > > > + > > > > > + list_for_each_entry(resv, resv_regions, list) { > > > > > + phys_addr_t start, end; > > > > > + > > > > > + start = resv->start; > > > > > + end = resv->start + resv->length - 1; > > > > > + > > > > > + list_for_each_entry_safe(n, next, iova, list) { > > > > > + int ret = 0; > > > > > + > > > > > + /* No overlap */ > > > > > + if ((start > n->end) || (end < n->start)) > > > > > + continue; > > > > > + /* > > > > > + * Insert a new node if current node overlaps > with > > > > > the > > > > > + * reserve region to exlude that from valid iova > > > > > range. > > > > > + * Note that, new node is inserted before the > > > > > current > > > > > + * node and finally the current node is deleted > > > > > keeping > > > > > + * the list updated and sorted. > > > > > + */ > > > > > + if (start > n->start) > > > > > + ret = vfio_iommu_iova_insert(>list, > > > > > + n->start, start > - 1); > > > > > + if (!ret && end < n->end) > > > > > + ret = vfio_iommu_iova_insert(>list, > > > > > + end + 1, n- > >end); > > > > > + if (ret) > > > > > + return ret; > > > > > > > > Is it safer to delete the 1st node here in case of failure of the 2nd > > > > node? > > > > There is no problem with current logic since upon error iova_copy will > > > > be released anyway. However this function alone doesn't assume the > > > > fact of a temporary list, thus it's better to keep the list clean w/o > > > > garbage > > > > left from any error handling. > > > > > > I don't think the proposal makes the list notably more sane on failure > > > than we have here. If the
RE: [PATCH v5 0/7] vfio/type1: Add support for valid iova list management
> From: Alex Williamson > Sent: Thursday, March 22, 2018 1:11 AM > > On Wed, 21 Mar 2018 03:28:16 + > "Tian, Kevin"wrote: > > > > From: Alex Williamson [mailto:alex.william...@redhat.com] > > > Sent: Wednesday, March 21, 2018 6:55 AM > > > > > > On Mon, 19 Mar 2018 08:28:32 + > > > "Tian, Kevin" wrote: > > > > > > > > From: Shameer Kolothum > > > > > Sent: Friday, March 16, 2018 12:35 AM > > > > > > > > > > This series introduces an iova list associated with a vfio > > > > > iommu. The list is kept updated taking care of iommu apertures, > > > > > and reserved regions. Also this series adds checks for any conflict > > > > > with existing dma mappings whenever a new device group is > attached > > > to > > > > > the domain. > > > > > > > > > > User-space can retrieve valid iova ranges using > > > VFIO_IOMMU_GET_INFO > > > > > ioctl capability chains. Any dma map request outside the valid iova > > > > > range will be rejected. > > > > > > > > GET_INFO is done at initialization time which is good for cold attached > > > > devices. If a hotplugged device may cause change of valid iova ranges > > > > at run-time, then there could be potential problem (which however is > > > > difficult for user space or orchestration stack to figure out in > > > > advance) > > > > Can we do some extension like below to make hotplug case cleaner? > > > > > > Let's be clear what we mean by hotplug here, as I see it, the only > > > relevant hotplug would be a physical device, hot added to the host, > > > which becomes a member of an existing, in-use IOMMU group. If, on > the > > > other hand, we're talking about hotplug related to the user process, > > > there's nothing asynchronous about that. For instance in the QEMU > > > case, QEMU must add the group to the container, at which point it can > > > evaluate the new iova list and remove the group from the container if > > > it doesn't like the result. So what would be a case of the available > > > iova list for a group changing as a result of adding a device? > > > > My original thought was about the latter case. At that moment > > I was not sure whether the window between adding/removing > > the group may cause some issue if there are right some IOVA > > allocations happening in parallel. But looks Qemu can anyway > > handle it well as long as such scenario is considered. > > I believe the kernel patches here and the existing code are using > locking to prevent races between mapping changes and device changes, so > the acceptance of a new group into a container and the iova list for a > container should always be correct for the order these operations > arrive. Beyond that, I don't believe it's the kernel's responsibility > to do anything more than block groups from being added if they conflict > with current mappings. The user already has the minimum interfaces > they need to manage other scenarios. Agree > > > > > - An interface allowing user space to request VFIO rejecting further > > > > attach_group if doing so may cause iova range change. e.g. Qemu can > > > > do such request once completing initial GET_INFO; > > > > > > For the latter case above, it seems unnecessary, QEMU can revert the > > > attach, we're only promising that the attach won't interfere with > > > existing mappings. For the host hotplug case, the user has no control, > > > the new device is a member of the iommu group and therefore > necessarily > > > becomes a part of container. I imagine there are plenty of other holes > > > in this scenario already. > > > > > > > - or an event notification to user space upon change of valid iova > > > > ranges when attaching a new device at run-time. It goes one step > > > > further - even attach may cause iova range change, it may still > > > > succeed as long as Qemu hasn't allocated any iova in impacted > > > > range > > > > > > Same as above, in the QEMU hotplug case, the user is orchestrating the > > > adding of the group to the container, they can then check the iommu > > > info on their own and determine what, if any, changes are relevant to > > > them, knowing that the addition would not succeed if any current > > > mappings where affected. In the host case, a notification would be > > > required, but we'd first need to identify exactly how the iova list can > > > change asynchronous to the user doing anything. Thanks, > > > > for host hotplug, possibly notification could be an opt-in model. > > meaning if user space doesn't explicitly request receiving notification > > on such event, the device is just left in unused state (vfio-pci still > > claims the device, assuming it assigned to the container owner, but > > the owner doesn't use it) > > Currently, if a device is added to a live group, the kernel will print > a warning. We have a todo to bind that device to a vfio-bus driver, > but I'm not sure that isn't an overreach into the system policy > decisions. If system policy decides to
Re: [PATCH v2 2/2] drivers: remove force dma flag from buses
> --- a/drivers/dma/qcom/hidma_mgmt.c > +++ b/drivers/dma/qcom/hidma_mgmt.c > @@ -398,7 +398,7 @@ static int __init hidma_mgmt_of_populate_channels(struct > device_node *np) > } > of_node_get(child); > new_pdev->dev.of_node = child; > - of_dma_configure(_pdev->dev, child); > + of_dma_configure(_pdev->dev, child, true); > /* >* It is assumed that calling of_msi_configure is safe on >* platforms with or without MSI support. Where did we mark this bus as force_dma before? > diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c > index 9a4f4246..895c83e 100644 > --- a/drivers/of/of_reserved_mem.c > +++ b/drivers/of/of_reserved_mem.c > @@ -353,7 +353,7 @@ int of_reserved_mem_device_init_by_idx(struct device *dev, > /* ensure that dma_ops is set for virtual devices >* using reserved memory >*/ > - of_dma_configure(dev, np); > + of_dma_configure(dev, np, true); Did all the callers of this one really force dma? I have a hard time untangling the call stacks unfortunately. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 1/2] dma-mapping: move dma configuration to bus infrastructure
> > +int dma_configure(struct device *dev) > > +{ > > + if (dev->bus->dma_configure) > > + return dev->bus->dma_configure(dev); > > What if dma_common_configure() is called in case "bus->dma_configure" is not > defined? Then we'd still have a dependency of common code on OF and ACPI. On the other hand we'd get free OF and ACPI dma ranges parsing for everyone, which might be handy. And which would really help mitigating the risk that we missed some bus that gets dma configuration from OF, so maybe it actually is a good idea. I'd just rename it to dma_default_configure or similar in that case. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2 1/2] dma-mapping: move dma configuration to bus infrastructure
> +static int amba_dma_configure(struct device *dev) > +{ > + return dma_common_configure(dev); > +} So it turns out we only end with two callers of dma_common_configure after this series. Based ont hat I'm tempted with the suggestion from Robin to just have amba call platform_dma_configure, and move the code from dma_common_configure to platform_dma_configure. > +int dma_configure(struct device *dev) > +{ > + if (dev->bus->dma_configure) > + return dev->bus->dma_configure(dev); > + > + return 0; > +} > void dma_deconfigure(struct device *dev) As grep pointed out this wants a blank line after the function, and while in nitpicking mode I'd suggest to remove the blank line above the return statement while at it. > diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c > index 88a3558..fa9896d 100644 > --- a/drivers/gpu/host1x/bus.c > +++ b/drivers/gpu/host1x/bus.c > @@ -314,6 +314,14 @@ static int host1x_device_match(struct device *dev, > struct device_driver *drv) > return strcmp(dev_name(dev), drv->name) == 0; > } > > +static int host1x_dma_configure(struct device *dev) > +{ > + if (dev->of_node) > + return of_dma_configure(dev, dev->of_node); > + > + return 0; Same here. > + */ > +static int pci_dma_configure(struct device *dev) > +{ > + struct device *bridge; > + enum dev_dma_attr attr; > + int ret = 0; > + > + bridge = pci_get_host_bridge_device(to_pci_dev(dev)); > + > + if (IS_ENABLED(CONFIG_OF) && bridge->parent && > + bridge->parent->of_node) { > + ret = of_dma_configure(dev, bridge->parent->of_node); > + } else if (has_acpi_companion(bridge)) { > + attr = acpi_get_dma_attr(to_acpi_device_node(bridge->fwnode)); > + if (attr != DEV_DMA_NOT_SUPPORTED) > + ret = acpi_dma_configure(dev, attr); > + } The attr declaration can be moved into the inner scope here. > + pci_put_host_bridge_device(bridge); > + > + return ret; Drop the blank line after the return, please. ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu