[PATCH v3 15/16] iommu/vt-d: Support flushing more translation cache types

2019-05-03 Thread Jacob Pan
When Shared Virtual Memory is exposed to a guest via vIOMMU, scalable
mode IOTLB invalidations may be passed down from outside the IOMMU
subsystem. This patch adds invalidation functions that can be used for
the additional translation cache types.
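
For reference, a minimal sketch of how a passdown request might end up in the
new PASID-based IOTLB helper (caller context and variable names are
illustrative; the QI_GRAN_* encodings are the ones referenced later in patch
16/16, not part of this patch):

	/* flush all first level translations cached for one PASID on a domain */
	qi_flush_piotlb(iommu, did, 0, pasid, 0, QI_GRAN_NONG_PASID, 0);

	/* flush a 2MB range (size_order 9 in 4K pages) for the same PASID */
	qi_flush_piotlb(iommu, did, addr, pasid, 9, QI_GRAN_PSI_PASID, 0);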

Signed-off-by: Jacob Pan 
---
 drivers/iommu/dmar.c| 50 +
 include/linux/intel-iommu.h | 21 +++
 2 files changed, 67 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 9c49300..46ad701 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1357,6 +1357,21 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
	qi_submit_sync(&desc, iommu);
 }
 
+/* PASID-based IOTLB Invalidate */
+void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u64 addr, u32 pasid,
+   unsigned int size_order, u64 granu, int ih)
+{
+   struct qi_desc desc;
+
+   desc.qw0 = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) |
+   QI_EIOTLB_GRAN(granu) | QI_EIOTLB_TYPE;
+   desc.qw1 = QI_EIOTLB_ADDR(addr) | QI_EIOTLB_IH(ih) |
+   QI_EIOTLB_AM(size_order);
+   desc.qw2 = 0;
+   desc.qw3 = 0;
+   qi_submit_sync(&desc, iommu);
+}
+
 void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
u16 qdep, u64 addr, unsigned mask)
 {
@@ -1380,6 +1395,41 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
	qi_submit_sync(&desc, iommu);
 }
 
+/* PASID-based device IOTLB Invalidate */
+void qi_flush_dev_piotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+   u32 pasid,  u16 qdep, u64 addr, unsigned size, u64 granu)
+{
+   struct qi_desc desc;
+
+   desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) |
+   QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE |
+   QI_DEV_IOTLB_PFSID(pfsid);
+   desc.qw1 = QI_DEV_EIOTLB_GLOB(granu);
+
+   /* If the S bit is 0, we only flush a single page. If the S bit is
+* set, the least significant zero bit indicates the size. See VT-d
+* spec 6.5.2.6.
+*/
+   if (!size)
+   desc.qw1 |= QI_DEV_EIOTLB_ADDR(addr) & ~QI_DEV_EIOTLB_SIZE;
+   else {
+   unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size);
+
+   desc.qw1 |= QI_DEV_EIOTLB_ADDR(addr & ~mask) | QI_DEV_EIOTLB_SIZE;
+   }
+   qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, int pasid)
+{
+   struct qi_desc desc;
+
+   desc.qw0 = QI_PC_TYPE | QI_PC_DID(did) | QI_PC_GRAN(granu) | QI_PC_PASID(pasid);
+   desc.qw1 = 0;
+   desc.qw2 = 0;
+   desc.qw3 = 0;
+   qi_submit_sync(&desc, iommu);
+}
 /*
  * Disable Queued Invalidation interface.
  */
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 774f368..6b6522d 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -339,7 +339,7 @@ enum {
 #define QI_IOTLB_GRAN(gran)(((u64)gran) >> (DMA_TLB_FLUSH_GRANU_OFFSET-4))
 #define QI_IOTLB_ADDR(addr)(((u64)addr) & VTD_PAGE_MASK)
 #define QI_IOTLB_IH(ih)(((u64)ih) << 6)
-#define QI_IOTLB_AM(am)(((u8)am))
+#define QI_IOTLB_AM(am)(((u8)am) & 0x3f)
 
 #define QI_CC_FM(fm)   (((u64)fm) << 48)
 #define QI_CC_SID(sid) (((u64)sid) << 32)
@@ -357,17 +357,22 @@ enum {
 #define QI_PC_DID(did) (((u64)did) << 16)
 #define QI_PC_GRAN(gran)   (((u64)gran) << 4)
 
-#define QI_PC_ALL_PASIDS   (QI_PC_TYPE | QI_PC_GRAN(0))
-#define QI_PC_PASID_SEL(QI_PC_TYPE | QI_PC_GRAN(1))
+/* PASID cache invalidation granu */
+#define QI_PC_ALL_PASIDS   0
+#define QI_PC_PASID_SEL    1
 
 #define QI_EIOTLB_ADDR(addr)   ((u64)(addr) & VTD_PAGE_MASK)
 #define QI_EIOTLB_GL(gl)   (((u64)gl) << 7)
 #define QI_EIOTLB_IH(ih)   (((u64)ih) << 6)
-#define QI_EIOTLB_AM(am)   (((u64)am))
+#define QI_EIOTLB_AM(am)   (((u64)am) & 0x3f)
 #define QI_EIOTLB_PASID(pasid) (((u64)pasid) << 32)
 #define QI_EIOTLB_DID(did) (((u64)did) << 16)
 #define QI_EIOTLB_GRAN(gran)   (((u64)gran) << 4)
 
+/* QI Dev-IOTLB inv granu */
+#define QI_DEV_IOTLB_GRAN_ALL  1
+#define QI_DEV_IOTLB_GRAN_PASID_SEL    0
+
 #define QI_DEV_EIOTLB_ADDR(a)  ((u64)(a) & VTD_PAGE_MASK)
 #define QI_DEV_EIOTLB_SIZE (((u64)1) << 11)
 #define QI_DEV_EIOTLB_GLOB(g)  ((u64)g)
@@ -658,8 +663,16 @@ extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
 u8 fm, u64 type);
 extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
  unsigned int size_order, u64 type);
+extern void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+   u32 pasid, unsigned int size_order, u64 type, int ih);
 extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,

[PATCH v3 16/16] iommu/vt-d: Add svm/sva invalidate function

2019-05-03 Thread Jacob Pan
When Shared Virtual Address (SVA) is enabled for a guest OS via
vIOMMU, we need to provide invalidation support at the IOMMU API and
driver level. This patch adds the Intel VT-d specific function that
implements the IOMMU passdown invalidate API for shared virtual address.

The use case is for supporting caching structure invalidation
of assigned SVM capable devices. Emulated IOMMU exposes queue
invalidation capability and passes down all descriptors from the guest
to the physical IOMMU.

The assumption is that the guest-to-host device ID mapping is
resolved prior to calling the IOMMU driver. Based on the device handle,
the host IOMMU driver can replace certain fields before submitting to
the invalidation queue.
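
As an illustration of the conversion helpers added below (a sketch only; the
caller context, variables and the 2MB example range are illustrative):

	u64 granu, size;

	/* IOTLB type + page selective granularity -> VT-d PASID-based PSI */
	if (to_vtd_granularity(IOMMU_CACHE_INV_TYPE_IOTLB,
			       IOMMU_INV_GRANU_ADDR, &granu))
		return -EINVAL;		/* combination not supported */

	/* 512 granules of 4KB -> a 2MB range, encoded as order 9 */
	size = to_vtd_size(SZ_4K, 512);
	qi_flush_piotlb(iommu, did, addr, pasid, size, granu, 0);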

Signed-off-by: Jacob Pan 
Signed-off-by: Ashok Raj 
Signed-off-by: Liu, Yi L 
---
 drivers/iommu/intel-iommu.c | 160 
 1 file changed, 160 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index a10cb70..94eb211 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5340,6 +5340,165 @@ static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
aux_domain_remove_dev(to_dmar_domain(domain), dev);
 }
 
+/*
+ * 2D array for converting and sanitizing IOMMU generic TLB granularity to
+ * VT-d granularity. Invalidation is typically included in the unmap operation
+ * as a result of a DMA or VFIO unmap. However, an assigned device may own its
+ * first level page tables without them being shadowed by QEMU. In that case
+ * there is no pass down unmap to the host IOMMU when the guest unmaps; only
+ * invalidations are trapped and passed down.
+ * In all cases, only first level TLB invalidations (requests with PASID) can
+ * be passed down, therefore we do not include IOTLB granularities for requests
+ * without PASID (second level).
+ *
+ * For example, to find the VT-d granularity encoding for IOTLB
+ * type and page selective granularity within PASID:
+ * X: indexed by iommu cache type
+ * Y: indexed by enum iommu_inv_granularity
+ * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
+ *
+ * Granu_map array indicates validity of the table. 1: valid, 0: invalid
+ *
+ */
+const static int inv_type_granu_map[IOMMU_CACHE_TYPE_NR][IOMMU_INVAL_GRANU_NR] = {
+   /* PASID based IOTLB, support PASID selective and page selective */
+   {0, 1, 1},
+   /* PASID based dev TLBs, only support all PASIDs or single PASID */
+   {1, 1, 0},
+   /* PASID cache */
+   {1, 1, 0}
+};
+
+const static u64 inv_type_granu_table[IOMMU_CACHE_TYPE_NR][IOMMU_INVAL_GRANU_NR] = {
+   /* PASID based IOTLB */
+   {0, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
+   /* PASID based dev TLBs */
+   {QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0},
+   /* PASID cache */
+   {QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, 0},
+};
+
+static inline int to_vtd_granularity(int type, int granu, u64 *vtd_granu)
+{
+   if (type >= IOMMU_CACHE_TYPE_NR || granu >= IOMMU_INVAL_GRANU_NR ||
+   !inv_type_granu_map[type][granu])
+   return -EINVAL;
+
+   *vtd_granu = inv_type_granu_table[type][granu];
+
+   return 0;
+}
+
+static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
+{
+   u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
+
+   /* VT-d size is encoded as 2^size of 4K pages, 0 for 4K, 9 for 2MB, etc.
+* The IOMMU cache invalidate API passes granu_size in bytes, and the
+* number of granules of that size in contiguous memory.
+*/
+   return order_base_2(nr_pages);
+}
+
+#ifdef CONFIG_INTEL_IOMMU_SVM
+static int intel_iommu_sva_invalidate(struct iommu_domain *domain,
+   struct device *dev, struct iommu_cache_invalidate_info *inv_info)
+{
+   struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+   struct device_domain_info *info;
+   struct intel_iommu *iommu;
+   unsigned long flags;
+   int cache_type;
+   u8 bus, devfn;
+   u16 did, sid;
+   int ret = 0;
+   u64 granu;
+   u64 size;
+
+   if (!inv_info || !dmar_domain ||
+   inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
+   return -EINVAL;
+
+   if (!dev || !dev_is_pci(dev))
+   return -ENODEV;
+
+   iommu = device_to_iommu(dev, &bus, &devfn);
+   if (!iommu)
+   return -ENODEV;
+
+   spin_lock_irqsave(&device_domain_lock, flags);
+   spin_lock(&iommu->lock);
+   info = iommu_support_dev_iotlb(dmar_domain, iommu, bus, devfn);
+   if (!info) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+   did = dmar_domain->iommu_did[iommu->seq_id];
+   sid = PCI_DEVID(bus, devfn);
+   size = to_vtd_size(inv_info->addr_info.granule_size, inv_info->addr_info.nb_granules);
+
+   for_each_set_bit(cache_type, (unsigned long *)&inv_info->cache, IOMMU_CACHE_TYPE_NR) {
+
+   ret = 

[PATCH v3 09/16] iommu: Introduce guest PASID bind function

2019-05-03 Thread Jacob Pan
Guest shared virtual address (SVA) may require the host to shadow guest
PASID tables. Guest PASIDs can also be allocated from the host via
enlightened interfaces. In this case, the guest needs to bind the guest
mm, i.e. the CR3 in guest physical address, to the actual PASID table in
the host IOMMU. Nesting will be turned on such that guest virtual
addresses can go through a two-level translation:
- 1st level translates GVA to GPA
- 2nd level translates GPA to HPA
This patch introduces APIs to bind guest PASID data to the assigned
device entry in the physical IOMMU. See the diagram below for usage
explanation.

 .-------------.  .---------------------------.
 |   vIOMMU    |  | Guest process mm, FL only |
 |             |  '---------------------------'
 .----------------/
 | PASID Entry |--- PASID cache flush -
 '-------------'                       |
 |             |                       V
 |             |
 '-------------'
Guest
------| Shadow |--------------------------|------------
      v        v                          v
Host
 .-------------.  .----------------------.
 |   pIOMMU    |  | Bind FL for GVA-GPA  |
 |             |  '----------------------'
 .----------------/  |
 | PASID Entry |     V (Nested xlate)
 '----------------\.-------------------.
 |             |   | Set SL to GPA-HPA |
 |             |   '-------------------'
 '-------------'

Where:
 - FL = First level/stage one page tables
 - SL = Second level/stage two page tables
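
A minimal usage sketch for the new API, assuming a VFIO-like caller; the PASID
value and the remaining gpasid_bind_data fields (guest page table pointer,
address width, etc.) are placeholders filled in by the VMM:

	struct gpasid_bind_data data = { 0 };
	int ret;

	data.pasid = guest_pasid;	/* PASID the guest assigned to its mm */
	/* ... guest CR3/FL page table pointer and flags filled here ... */

	ret = iommu_sva_bind_gpasid(domain, dev, &data);
	if (ret)
		return ret;

	/* tear-down path */
	iommu_sva_unbind_gpasid(domain, dev, data.pasid);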

Signed-off-by: Jacob Pan 
Signed-off-by: Liu Yi L 
---
 drivers/iommu/iommu.c  | 20 
 include/linux/iommu.h  | 10 ++
 include/uapi/linux/iommu.h | 15 ++-
 3 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index a2f6f3e..f8572d2 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1659,6 +1659,26 @@ int iommu_cache_invalidate(struct iommu_domain *domain, struct device *dev,
 }
 EXPORT_SYMBOL_GPL(iommu_cache_invalidate);
 
+int iommu_sva_bind_gpasid(struct iommu_domain *domain,
+   struct device *dev, struct gpasid_bind_data *data)
+{
+   if (unlikely(!domain->ops->sva_bind_gpasid))
+   return -ENODEV;
+
+   return domain->ops->sva_bind_gpasid(domain, dev, data);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_bind_gpasid);
+
+int iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev,
+   int pasid)
+{
+   if (unlikely(!domain->ops->sva_unbind_gpasid))
+   return -ENODEV;
+
+   return domain->ops->sva_unbind_gpasid(dev, pasid);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid);
+
 static void __iommu_detach_device(struct iommu_domain *domain,
  struct device *dev)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d182525..9a69b59 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -268,6 +268,8 @@ struct page_response_msg {
  * @detach_pasid_table: detach the pasid table
  * @cache_invalidate: invalidate translation caches
  * @pgsize_bitmap: bitmap of all possible supported page sizes
+ * @sva_bind_gpasid: bind guest pasid and mm
+ * @sva_unbind_gpasid: unbind guest pasid and mm
  */
 struct iommu_ops {
bool (*capable)(enum iommu_cap);
@@ -332,6 +334,10 @@ struct iommu_ops {
int (*page_response)(struct device *dev, struct page_response_msg *msg);
int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev,
struct iommu_cache_invalidate_info *inv_info);
+   int (*sva_bind_gpasid)(struct iommu_domain *domain,
+   struct device *dev, struct gpasid_bind_data *data);
+
+   int (*sva_unbind_gpasid)(struct device *dev, int pasid);
 
unsigned long pgsize_bitmap;
 };
@@ -447,6 +453,10 @@ extern void iommu_detach_pasid_table(struct iommu_domain *domain);
 extern int iommu_cache_invalidate(struct iommu_domain *domain,
  struct device *dev,
  struct iommu_cache_invalidate_info *inv_info);
+extern int iommu_sva_bind_gpasid(struct iommu_domain *domain,
+   struct device *dev, struct gpasid_bind_data *data);
+extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain,
+   struct device *dev, int pasid);
 extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
 extern struct iommu_domain *iommu_get_dma_domain(struct device *dev);
 extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
index fa96ecb..3a781df 100644
--- a/include/uapi/linux/iommu.h
+++ b/include/uapi/linux/iommu.h
@@ -240,6 +240,19 @@ struct iommu_cache_invalidate_info {
struct iommu_inv_addr_info addr_info;
};
 };
-
+/**
+ * struct gpasid_bind_data - Information about device 

[PATCH v3 12/16] iommu/vt-d: Add nested translation helper function

2019-05-03 Thread Jacob Pan
Nested translation mode is supported in the VT-d 3.0 spec, chapter 3.8.
With the PASID granular translation type set to 011b (nested), the
translation result from the first level (FL) is also subject to a
second level (SL) page table translation. This mode is used for SVA
virtualization, where FL performs guest virtual to guest physical
translation and SL performs guest physical to host physical translation.
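
A sketch of how the guest PASID bind path (patch 14/16) is expected to call
this helper; gpgd, pasid and addr_width stand in for values taken from the
bind data, which is not shown in full in this excerpt:

	struct dmar_domain *ddomain = to_dmar_domain(domain);

	/* gpgd: guest first level page table pointer (GPA), provided by the VMM */
	ret = intel_pasid_setup_nested(iommu, dev, (pgd_t *)gpgd, pasid,
				       PASID_FLAG_NESTED, ddomain, addr_width);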

Signed-off-by: Jacob Pan 
Signed-off-by: Liu, Yi L 
---
 drivers/iommu/intel-pasid.c | 93 +
 drivers/iommu/intel-pasid.h | 11 ++
 2 files changed, 104 insertions(+)

diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c
index dde05b5..d8421f7 100644
--- a/drivers/iommu/intel-pasid.c
+++ b/drivers/iommu/intel-pasid.c
@@ -682,3 +682,96 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
 
return 0;
 }
+
+/**
+ * intel_pasid_setup_nested() - Set up PASID entry for nested translation
+ * which is used for vSVA. The first level page tables are used for
+ * GVA-GPA translation in the guest, second level page tables are used
+ * for GPA to HPA translation.
+ *
+ * @iommu:  IOMMU the device belongs to
+ * @dev:    Device to be set up for translation
+ * @gpgd:   FLPTPTR: First Level Page translation pointer in GPA
+ * @pasid:  PASID to be programmed in the device PASID table
+ * @flags:  Additional info such as supervisor PASID
+ * @domain: Domain info for setting up second level page tables
+ * @addr_width: Address width of the first level (guest)
+ */
+int intel_pasid_setup_nested(struct intel_iommu *iommu,
+   struct device *dev, pgd_t *gpgd,
+   int pasid, int flags,
+   struct dmar_domain *domain,
+   int addr_width)
+{
+   struct pasid_entry *pte;
+   struct dma_pte *pgd;
+   u64 pgd_val;
+   int agaw;
+   u16 did;
+
+   if (!ecap_nest(iommu->ecap)) {
+   pr_err("IOMMU: %s: No nested translation support\n",
+  iommu->name);
+   return -EINVAL;
+   }
+
+   pte = intel_pasid_get_entry(dev, pasid);
+   if (WARN_ON(!pte))
+   return -EINVAL;
+
+   pasid_clear_entry(pte);
+
+   /* Address width sanity checking is performed by the caller,
+* matching in two dimensions:
+* 1. CPU vs. IOMMU
+* 2. Guest vs. Host.
+*/
+   switch (addr_width) {
+   case 57:
+   pasid_set_flpm(pte, 1);
+   break;
+   case 48:
+   pasid_set_flpm(pte, 0);
+   break;
+   default:
+   dev_err(dev, "Invalid paging mode %d\n", addr_width);
+   return -EINVAL;
+   }
+
+   /* Setup the first level page table pointer in GPA */
+   pasid_set_flptr(pte, (u64)gpgd);
+   if (flags & PASID_FLAG_SUPERVISOR_MODE) {
+   if (!ecap_srs(iommu->ecap)) {
+   pr_err("No supervisor request support on %s\n",
+  iommu->name);
+   return -EINVAL;
+   }
+   pasid_set_sre(pte);
+   }
+
+   /* Setup the second level based on the given domain */
+   pgd = domain->pgd;
+
+   for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
+   pgd = phys_to_virt(dma_pte_addr(pgd));
+   if (!dma_pte_present(pgd)) {
+   dev_err(dev, "Invalid domain page table\n");
+   return -EINVAL;
+   }
+   }
+   pgd_val = virt_to_phys(pgd);
+   pasid_set_slptr(pte, pgd_val);
+   pasid_set_fault_enable(pte);
+
+   did = domain->iommu_did[iommu->seq_id];
+   pasid_set_domain_id(pte, did);
+
+   pasid_set_address_width(pte, agaw);
+   pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+
+   pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
+   pasid_set_present(pte);
+   pasid_flush_caches(iommu, pte, pasid, did);
+
+   return 0;
+}
diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h
index 4b26ab5..2234fd5 100644
--- a/drivers/iommu/intel-pasid.h
+++ b/drivers/iommu/intel-pasid.h
@@ -42,6 +42,7 @@
  * to vmalloc or even module mappings.
  */
 #define PASID_FLAG_SUPERVISOR_MODE BIT(0)
+#define PASID_FLAG_NESTED  BIT(1)
 
 struct pasid_dir_entry {
u64 val;
@@ -51,6 +52,11 @@ struct pasid_entry {
u64 val[8];
 };
 
+#define PASID_ENTRY_PGTT_FL_ONLY   (1)
+#define PASID_ENTRY_PGTT_SL_ONLY   (2)
+#define PASID_ENTRY_PGTT_NESTED(3)
+#define PASID_ENTRY_PGTT_PT(4)
+
 /* The representative of a PASID table */
 struct pasid_table {
void*table; /* pasid table pointer */
@@ -77,6 +83,11 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
 int intel_pasid_setup_pass_through(struct intel_iommu *iommu,

[PATCH v3 14/16] iommu/vt-d: Add bind guest PASID support

2019-05-03 Thread Jacob Pan
When supporting guest SVA with an emulated IOMMU, the guest PASID
table is shadowed in the VMM. Updates to the guest vIOMMU PASID table
result in PASID cache flushes, which are passed down to the host as
bind guest PASID calls.

The SL page tables are harvested from the device's default domain
(requests w/o PASID), or from the aux domain in the case of a
mediated device.

 .-------------.  .---------------------------.
 |   vIOMMU    |  | Guest process CR3, FL only|
 |             |  '---------------------------'
 .----------------/
 | PASID Entry |--- PASID cache flush -
 '-------------'                       |
 |             |                       V
 |             |                CR3 in GPA
 '-------------'
Guest
------| Shadow |--------------------------|------------
      v        v                          v
Host
 .-------------.  .------------------------------.
 |   pIOMMU    |  | Bind FL for GVA-GPA          |
 |             |  '------------------------------'
 .----------------/  |
 | PASID Entry |     V (Nested xlate)
 '----------------\.------------------------------.
 |             |   |SL for GPA-HPA, default domain|
 |             |   '------------------------------'
 '-------------'
Where:
 - FL = First level/stage one page tables
 - SL = Second level/stage two page tables

Signed-off-by: Jacob Pan 
Signed-off-by: Liu, Yi L 
---
 drivers/iommu/intel-iommu.c |   4 +
 drivers/iommu/intel-svm.c   | 175 
 include/linux/intel-iommu.h |  10 ++-
 include/linux/intel-svm.h   |   7 ++
 4 files changed, 194 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 1316c96..a10cb70 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5770,6 +5770,10 @@ const struct iommu_ops intel_iommu_ops = {
.dev_enable_feat= intel_iommu_dev_enable_feat,
.dev_disable_feat   = intel_iommu_dev_disable_feat,
.pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
+#ifdef CONFIG_INTEL_IOMMU_SVM
+   .sva_bind_gpasid= intel_svm_bind_gpasid,
+   .sva_unbind_gpasid  = intel_svm_unbind_gpasid,
+#endif
 };
 
 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 068dd9e..0815615 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -231,6 +231,181 @@ static LIST_HEAD(global_svm_list);
list_for_each_entry(sdev, &svm->devs, list) \
if (dev == sdev->dev)   \
 
+int intel_svm_bind_gpasid(struct iommu_domain *domain,
+   struct device *dev,
+   struct gpasid_bind_data *data)
+{
+   struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+   struct intel_svm_dev *sdev;
+   struct intel_svm *svm = NULL;
+   struct dmar_domain *ddomain;
+   int ret = 0;
+
+   if (WARN_ON(!iommu) || !data)
+   return -EINVAL;
+
+   if (dev_is_pci(dev)) {
+   /* VT-d supports devices with full 20 bit PASIDs only */
+   if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
+   return -EINVAL;
+   }
+
+   if (data->pasid <= 0 || data->pasid >= PASID_MAX)
+   return -EINVAL;
+
+   ddomain = to_dmar_domain(domain);
+   /* REVISIT:
+* Sanity check address width and paging mode support,
+* matching in two dimensions:
+* 1. paging mode CPU <= IOMMU
+* 2. address width Guest <= Host.
+*/
+   mutex_lock(&pasid_mutex);
+   svm = ioasid_find(NULL, data->pasid, NULL);
+   if (IS_ERR(svm)) {
+   ret = PTR_ERR(svm);
+   goto out;
+   }
+   if (svm) {
+   /*
+* If we found svm for the PASID, there must be at
+* least one device bond, otherwise svm should be freed.
+*/
+   BUG_ON(list_empty(&svm->devs));
+
+   for_each_svm_dev() {
+   /* In case of multiple sub-devices of the same pdev
+* assigned, we should allow multiple bind calls with
+* the same PASID and pdev.
+*/
+   sdev->users++;
+   goto out;
+   }
+   } else {
+   /* We come here when the PASID has never been bound to a device. */
+   svm = kzalloc(sizeof(*svm), GFP_KERNEL);
+   if (!svm) {
+   ret = -ENOMEM;
+   goto out;
+   }
+   /* REVISIT: the upper layer/VFIO can track the host process that
+* binds the PASID. ioasid_set = mm might be sufficient for VFIO
+* to check PASID VMM ownership.
+*/
+   svm->mm = get_task_mm(current);
+   svm->pasid = data->pasid;
+   refcount_set(&svm->refs, 0);
+

[PATCH v3 11/16] iommu/vt-d: Avoid duplicated code for PASID setup

2019-05-03 Thread Jacob Pan
After each PASID entry setup, the related translation caches must be flushed.
We can combine the duplicated code into one function, which is less error prone.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel-pasid.c | 48 +
 1 file changed, 18 insertions(+), 30 deletions(-)

diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c
index 2ce6ac2..dde05b5 100644
--- a/drivers/iommu/intel-pasid.c
+++ b/drivers/iommu/intel-pasid.c
@@ -520,6 +520,21 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
devtlb_invalidation_with_pasid(iommu, dev, pasid);
 }
 
+static inline void pasid_flush_caches(struct intel_iommu *iommu,
+   struct pasid_entry *pte,
+   int pasid, u16 did)
+{
+   if (!ecap_coherent(iommu->ecap))
+   clflush_cache_range(pte, sizeof(*pte));
+
+   if (cap_caching_mode(iommu->cap)) {
+   pasid_cache_invalidation_with_pasid(iommu, did, pasid);
+   iotlb_invalidation_with_pasid(iommu, did, pasid);
+   } else
+   iommu_flush_write_buffer(iommu);
+
+}
+
 /*
  * Set up the scalable mode pasid table entry for first only
  * translation type.
@@ -565,16 +580,7 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
/* Setup Present and PASID Granular Transfer Type: */
pasid_set_translation_type(pte, 1);
pasid_set_present(pte);
-
-   if (!ecap_coherent(iommu->ecap))
-   clflush_cache_range(pte, sizeof(*pte));
-
-   if (cap_caching_mode(iommu->cap)) {
-   pasid_cache_invalidation_with_pasid(iommu, did, pasid);
-   iotlb_invalidation_with_pasid(iommu, did, pasid);
-   } else {
-   iommu_flush_write_buffer(iommu);
-   }
+   pasid_flush_caches(iommu, pte, pasid, did);
 
return 0;
 }
@@ -638,16 +644,7 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
 */
pasid_set_sre(pte);
pasid_set_present(pte);
-
-   if (!ecap_coherent(iommu->ecap))
-   clflush_cache_range(pte, sizeof(*pte));
-
-   if (cap_caching_mode(iommu->cap)) {
-   pasid_cache_invalidation_with_pasid(iommu, did, pasid);
-   iotlb_invalidation_with_pasid(iommu, did, pasid);
-   } else {
-   iommu_flush_write_buffer(iommu);
-   }
+   pasid_flush_caches(iommu, pte, pasid, did);
 
return 0;
 }
@@ -681,16 +678,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
 */
pasid_set_sre(pte);
pasid_set_present(pte);
-
-   if (!ecap_coherent(iommu->ecap))
-   clflush_cache_range(pte, sizeof(*pte));
-
-   if (cap_caching_mode(iommu->cap)) {
-   pasid_cache_invalidation_with_pasid(iommu, did, pasid);
-   iotlb_invalidation_with_pasid(iommu, did, pasid);
-   } else {
-   iommu_flush_write_buffer(iommu);
-   }
+   pasid_flush_caches(iommu, pte, pasid, did);
 
return 0;
 }
-- 
2.7.4



[PATCH v3 13/16] iommu/vt-d: Clean up for SVM device list

2019-05-03 Thread Jacob Pan
Use combined macro for_each_svm_dev() to simplify SVM device iteration.

Suggested-by: Andy Shevchenko 
Signed-off-by: Jacob Pan 
Reviewed-by: Eric Auger 
---
 drivers/iommu/intel-svm.c | 79 +++
 1 file changed, 39 insertions(+), 40 deletions(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 8fff212..068dd9e 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -227,6 +227,9 @@ static const struct mmu_notifier_ops intel_mmuops = {
 
 static DEFINE_MUTEX(pasid_mutex);
 static LIST_HEAD(global_svm_list);
+#define for_each_svm_dev() \
+   list_for_each_entry(sdev, &svm->devs, list) \
+   if (dev == sdev->dev)   \
 
int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops)
 {
@@ -273,15 +276,13 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
goto out;
}
 
-   list_for_each_entry(sdev, &svm->devs, list) {
-   if (dev == sdev->dev) {
-   if (sdev->ops != ops) {
-   ret = -EBUSY;
-   goto out;
-   }
-   sdev->users++;
-   goto success;
+   for_each_svm_dev() {
+   if (sdev->ops != ops) {
+   ret = -EBUSY;
+   goto out;
}
+   sdev->users++;
+   goto success;
}
 
break;
@@ -411,40 +412,38 @@ int intel_svm_unbind_mm(struct device *dev, int pasid)
if (!svm)
goto out;
 
-   list_for_each_entry(sdev, &svm->devs, list) {
-   if (dev == sdev->dev) {
-   ret = 0;
-   sdev->users--;
-   if (!sdev->users) {
-   list_del_rcu(&sdev->list);
-   /* Flush the PASID cache and IOTLB for this device.
-* Note that we do depend on the hardware *not* using
-* the PASID any more. Just as we depend on other
-* devices never using PASIDs that they have no right
-* to use. We have a *shared* PASID table, because it's
-* large and has to be physically contiguous. So it's
-* hard to be as defensive as we might like. */
-   intel_pasid_tear_down_entry(iommu, dev, svm->pasid);
-   intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm);
-   kfree_rcu(sdev, rcu);
-
-   if (list_empty(&svm->devs)) {
-   ioasid_free(svm->pasid);
-   if (svm->mm)
-   mmu_notifier_unregister(&svm->notifier, svm->mm);
-
-   list_del(&svm->list);
-
-   /* We mandate that no page faults may be outstanding
-* for the PASID when intel_svm_unbind_mm() is called.
-* If that is not obeyed, subtle errors will happen.
-* Let's make them less subtle... */
-   memset(svm, 0x6b, sizeof(*svm));
-   kfree(svm);
-   }
+   for_each_svm_dev() {
+   ret = 0;
+   sdev->users--;
+   if (!sdev->users) {
+   list_del_rcu(&sdev->list);
+   /* Flush the PASID cache and IOTLB for this device.
+* Note that we do depend on the hardware *not* using
+* the PASID any more. Just as we depend on other
+* devices never using PASIDs that they have no right
+* to use. We have a *shared* PASID table, because it's
+* large and has to be physically contiguous. So it's
+* hard to be as defensive as we might like. */
+   intel_pasid_tear_down_entry(iommu, dev, svm->pasid);
+   intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm);
+   kfree_rcu(sdev, rcu);
+
+   if (list_empty(&svm->devs)) {
+   ioasid_free(svm->pasid);
+   if (svm->mm)
+ 

[PATCH v3 10/16] iommu/vt-d: Move domain helper to header

2019-05-03 Thread Jacob Pan
Move the domain helper to the header so that it can be used by SVA code.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel-iommu.c | 6 --
 include/linux/intel-iommu.h | 6 ++
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 64af526..1316c96 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -427,12 +427,6 @@ static void init_translation_status(struct intel_iommu *iommu)
iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 }
 
-/* Convert generic 'struct iommu_domain to private struct dmar_domain */
-static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
-{
-   return container_of(dom, struct dmar_domain, domain);
-}
-
 static int __init intel_iommu_setup(char *str)
 {
if (!str)
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index c24c8aa..48fa164 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -597,6 +597,12 @@ static inline void __iommu_flush_cache(
clflush_cache_range(addr, size);
 }
 
+/* Convert generic 'struct iommu_domain to private struct dmar_domain */
+static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
+{
+   return container_of(dom, struct dmar_domain, domain);
+}
+
 /*
  * 0: readable
  * 1: writable
-- 
2.7.4



[PATCH v3 06/16] iommu/vt-d: Add custom allocator for IOASID

2019-05-03 Thread Jacob Pan
When the VT-d driver runs in the guest, PASID allocation must be
performed via the virtual command interface. This patch registers a
custom IOASID allocator which takes precedence over the default
XArray based allocator. The resulting IOASID allocations will always
come from the host. This ensures that the PASID namespace is
system-wide.
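
A sketch of the resulting allocation path in the guest (conceptual only; the
range and the private pointer are illustrative):

	/* ioasid_alloc() now routes to the registered custom allocator */
	ioasid_t pasid = ioasid_alloc(NULL, PASID_MIN, PASID_MAX - 1, svm);
	/* -> intel_ioasid_alloc() -> vcmd_alloc_pasid() -> host allocates PASID */
	if (pasid == INVALID_IOASID)
		return -ENOSPC;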

Signed-off-by: Lu Baolu 
Signed-off-by: Liu, Yi L 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/Kconfig   |  1 +
 drivers/iommu/intel-iommu.c | 60 +
 include/linux/intel-iommu.h |  2 ++
 3 files changed, 63 insertions(+)

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 75e7f97..d565ef7 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -210,6 +210,7 @@ config INTEL_IOMMU_SVM
bool "Support for Shared Virtual Memory with Intel IOMMU"
depends on INTEL_IOMMU && X86
select PCI_PASID
+   select IOASID
select MMU_NOTIFIER
help
  Shared Virtual Memory (SVM) provides a facility for devices
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index d93c4bd..fcc694a 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1711,6 +1711,8 @@ static void free_dmar_iommu(struct intel_iommu *iommu)
if (ecap_prs(iommu->ecap))
intel_svm_finish_prq(iommu);
}
+   ioasid_unregister_allocator(&iommu->pasid_allocator);
+
 #endif
 }
 
@@ -4811,6 +4813,46 @@ static int __init platform_optin_force_iommu(void)
return 1;
 }
 
+#ifdef CONFIG_INTEL_IOMMU_SVM
+static ioasid_t intel_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
+{
+   struct intel_iommu *iommu = data;
+   ioasid_t ioasid;
+
+   /*
+* VT-d virtual command interface always uses the full 20 bit
+* PASID range. Host can partition guest PASID range based on
+* policies but it is out of guest's control.
+*/
+   if (min < PASID_MIN || max > PASID_MAX)
+   return -EINVAL;
+
+   if (vcmd_alloc_pasid(iommu, &ioasid))
+   return INVALID_IOASID;
+
+   return ioasid;
+}
+
+static void intel_ioasid_free(ioasid_t ioasid, void *data)
+{
+   struct iommu_pasid_alloc_info *svm;
+   struct intel_iommu *iommu = data;
+
+   if (!iommu)
+   return;
+   /*
+* Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
+* We can only free the PASID when all the devices are unbound.
+*/
+   svm = ioasid_find(NULL, ioasid, NULL);
+   if (!svm) {
+   pr_warn("Freeing unbound IOASID %d\n", ioasid);
+   return;
+   }
+   vcmd_free_pasid(iommu, ioasid);
+}
+#endif
+
 int __init intel_iommu_init(void)
 {
int ret = -ENODEV;
@@ -4912,6 +4954,24 @@ int __init intel_iommu_init(void)
   "%s", iommu->name);
iommu_device_set_ops(>iommu, _iommu_ops);
iommu_device_register(>iommu);
+#ifdef CONFIG_INTEL_IOMMU_SVM
+   if (cap_caching_mode(iommu->cap) && sm_supported(iommu)) {
+   /*
+* Register a custom ASID allocator if we are running
+* in a guest, the purpose is to have a system wide PASID
+* namespace among all PASID users.
+* There can be multiple vIOMMUs in each guest but only
+* one allocator is active. All vIOMMU allocators will
+* eventually be calling the same host allocator.
+*/
+   iommu->pasid_allocator.alloc = intel_ioasid_alloc;
+   iommu->pasid_allocator.free = intel_ioasid_free;
+   iommu->pasid_allocator.pdata = (void *)iommu;
+   ret = ioasid_register_allocator(&iommu->pasid_allocator);
+   if (ret)
+   pr_warn("Custom PASID allocator registration failed\n");
+   }
+#endif
}
 
bus_set_iommu(_bus_type, _iommu_ops);
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index bff907b..c24c8aa 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -549,6 +550,7 @@ struct intel_iommu {
 #ifdef CONFIG_INTEL_IOMMU_SVM
struct page_req_dsc *prq;
unsigned char prq_name[16];/* Name for PRQ interrupt */
+   struct ioasid_allocator pasid_allocator; /* Custom allocator for PASIDs */
 #endif
struct q_inval  *qi;/* Queued invalidation info */
u32 *iommu_state; /* Store iommu states between suspend and resume.*/
-- 
2.7.4



[PATCH v3 02/16] iommu: Introduce cache_invalidate API

2019-05-03 Thread Jacob Pan
From: "Liu, Yi L" 

In any virtualization use case, when the first translation stage
is "owned" by the guest OS, the host IOMMU driver has no knowledge
of caching structure updates unless the guest invalidation activities
are trapped by the virtualizer and passed down to the host.

Since the invalidation data are obtained from user space and will be
written into the physical IOMMU, we must allow security checks at
various layers. Therefore, a generic invalidation data format is
proposed here; model-specific IOMMU drivers need to convert it into
their own format.
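
A sketch of how a caller such as VFIO might fill the generic data and invoke
the API (values are illustrative; only fields visible in this series are used,
and the granularity/PASID union members are omitted):

	struct iommu_cache_invalidate_info inv_info = { 0 };
	int ret;

	inv_info.version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
	inv_info.cache   = IOMMU_CACHE_INV_TYPE_IOTLB;	/* bitmask of cache types */
	inv_info.addr_info.granule_size = SZ_4K;
	inv_info.addr_info.nb_granules  = 512;		/* a 2MB range */

	ret = iommu_cache_invalidate(domain, dev, &inv_info);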

Signed-off-by: Liu, Yi L 
Signed-off-by: Jean-Philippe Brucker 
Signed-off-by: Jacob Pan 
Signed-off-by: Ashok Raj 
Signed-off-by: Eric Auger 

---
v6 -> v7:
- detail which fields are used for each invalidation type
- add a comment about multiple cache invalidation

v5 -> v6:
- fix merge issue

v3 -> v4:
- full reshape of the API following Alex' comments

v1 -> v2:
- add arch_id field
- renamed tlb_invalidate into cache_invalidate as this API allows
  to invalidate context caches on top of IOTLBs

v1:
renamed sva_invalidate into tlb_invalidate and add iommu_ prefix in
header. Commit message reworded.
---
 drivers/iommu/iommu.c  | 14 
 include/linux/iommu.h  | 15 -
 include/uapi/linux/iommu.h | 80 ++
 3 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 8df9d34..a2f6f3e 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1645,6 +1645,20 @@ void iommu_detach_pasid_table(struct iommu_domain *domain)
 }
 EXPORT_SYMBOL_GPL(iommu_detach_pasid_table);
 
+int iommu_cache_invalidate(struct iommu_domain *domain, struct device *dev,
+  struct iommu_cache_invalidate_info *inv_info)
+{
+   int ret = 0;
+
+   if (unlikely(!domain->ops->cache_invalidate))
+   return -ENODEV;
+
+   ret = domain->ops->cache_invalidate(domain, dev, inv_info);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_cache_invalidate);
+
 static void __iommu_detach_device(struct iommu_domain *domain,
  struct device *dev)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ab4d922..d182525 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -266,6 +266,7 @@ struct page_response_msg {
  * @page_response: handle page request response
  * @attach_pasid_table: attach a pasid table
  * @detach_pasid_table: detach the pasid table
+ * @cache_invalidate: invalidate translation caches
  * @pgsize_bitmap: bitmap of all possible supported page sizes
  */
 struct iommu_ops {
@@ -328,8 +329,9 @@ struct iommu_ops {
int (*attach_pasid_table)(struct iommu_domain *domain,
  struct iommu_pasid_table_config *cfg);
void (*detach_pasid_table)(struct iommu_domain *domain);
-
int (*page_response)(struct device *dev, struct page_response_msg *msg);
+   int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev,
+   struct iommu_cache_invalidate_info *inv_info);
 
unsigned long pgsize_bitmap;
 };
@@ -442,6 +444,9 @@ extern void iommu_detach_device(struct iommu_domain *domain,
 extern int iommu_attach_pasid_table(struct iommu_domain *domain,
struct iommu_pasid_table_config *cfg);
 extern void iommu_detach_pasid_table(struct iommu_domain *domain);
+extern int iommu_cache_invalidate(struct iommu_domain *domain,
+ struct device *dev,
+ struct iommu_cache_invalidate_info *inv_info);
 extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
 extern struct iommu_domain *iommu_get_dma_domain(struct device *dev);
 extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
@@ -982,6 +987,14 @@ static inline int iommu_sva_get_pasid(struct iommu_sva *handle)
 static inline
 void iommu_detach_pasid_table(struct iommu_domain *domain) {}
 
+static inline int
+iommu_cache_invalidate(struct iommu_domain *domain,
+  struct device *dev,
+  struct iommu_cache_invalidate_info *inv_info)
+{
+   return -ENODEV;
+}
+
 #endif /* CONFIG_IOMMU_API */
 
 #ifdef CONFIG_IOMMU_DEBUGFS
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
index 8848514..fa96ecb 100644
--- a/include/uapi/linux/iommu.h
+++ b/include/uapi/linux/iommu.h
@@ -162,4 +162,84 @@ struct iommu_pasid_table_config {
};
 };
 
+/* defines the granularity of the invalidation */
+enum iommu_inv_granularity {
+   IOMMU_INV_GRANU_DOMAIN, /* domain-selective invalidation */
+   IOMMU_INV_GRANU_PASID,  /* pasid-selective invalidation */
+   IOMMU_INV_GRANU_ADDR,   /* page-selective invalidation */
+   IOMMU_INVAL_GRANU_NR,   /* number of invalidation granularities */
+};
+
+/**
+ * Address Selective Invalidation 

[PATCH v3 07/16] iommu/vtd: Optimize tlb invalidation for vIOMMU

2019-05-03 Thread Jacob Pan
Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel-svm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 8f87304..f5d1e1e 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -211,7 +211,9 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
rcu_read_lock();
list_for_each_entry_rcu(sdev, >devs, list) {
intel_pasid_tear_down_entry(svm->iommu, sdev->dev, svm->pasid);
-   intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm);
+   /* for emulated iommu, PASID cache invalidation implies IOTLB/DTLB */
+   if (!cap_caching_mode(svm->iommu->cap))
+   intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm);
}
rcu_read_unlock();
 
-- 
2.7.4



[PATCH v3 01/16] iommu: Introduce attach/detach_pasid_table API

2019-05-03 Thread Jacob Pan
In a virtualization use case, when a guest is assigned
a PCI host device, protected by a virtual IOMMU on the guest,
the physical IOMMU must be programmed to be consistent with
the guest mappings. If the physical IOMMU supports two
translation stages, it makes sense to program guest mappings
onto the first stage/level (ARM/Intel terminology) while the host
owns stage/level 2.

In that case, it is mandated to trap on guest configuration
settings and pass those to the physical IOMMU driver.

This patch adds a new API to the iommu subsystem that allows
to set/unset the pasid table information.

A generic iommu_pasid_table_config struct is introduced in
a new iommu.h uapi header. This is going to be used by the VFIO
user API.
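
A minimal call sketch (the config contents come from the trapped guest
configuration; the iommu_pasid_table_config fields are not shown in this
excerpt, so they are left as placeholders):

	struct iommu_pasid_table_config cfg = { 0 };
	int ret;

	/* ... format, base pointer and size filled from the guest's settings ... */
	ret = iommu_attach_pasid_table(domain, &cfg);
	if (ret)
		return ret;

	/* unwind path */
	iommu_detach_pasid_table(domain);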

Signed-off-by: Jean-Philippe Brucker 
Signed-off-by: Liu, Yi L 
Signed-off-by: Ashok Raj 
Signed-off-by: Jacob Pan 
Signed-off-by: Eric Auger 
Reviewed-by: Jean-Philippe Brucker 

---

This patch generalizes the API introduced by Jacob & co-authors in
https://lwn.net/Articles/754331/

v4 -> v5:
- no returned valued for dummy definition of iommu_detach_pasid_table
- fix order in comment
- added Jean's R-b

v3 -> v4:
- s/set_pasid_table/attach_pasid_table
- restore detach_pasid_table. Detach can be used on unwind path.
- add padding
- remove @abort
- signature used for config and format
- add comments for fields in the SMMU struct

v2 -> v3:
- replace unbind/bind by set_pasid_table
- move table pointer and pasid bits in the generic part of the struct

v1 -> v2:
- restore the original pasid table name
- remove the struct device * parameter in the API
- reworked iommu_pasid_smmuv3
---
 drivers/iommu/iommu.c  | 19 +++
 include/linux/iommu.h  | 18 ++
 include/uapi/linux/iommu.h | 47 ++
 3 files changed, 84 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 7718568..8df9d34 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1626,6 +1626,25 @@ int iommu_page_response(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(iommu_page_response);
 
+int iommu_attach_pasid_table(struct iommu_domain *domain,
+struct iommu_pasid_table_config *cfg)
+{
+   if (unlikely(!domain->ops->attach_pasid_table))
+   return -ENODEV;
+
+   return domain->ops->attach_pasid_table(domain, cfg);
+}
+EXPORT_SYMBOL_GPL(iommu_attach_pasid_table);
+
+void iommu_detach_pasid_table(struct iommu_domain *domain)
+{
+   if (unlikely(!domain->ops->detach_pasid_table))
+   return;
+
+   domain->ops->detach_pasid_table(domain);
+}
+EXPORT_SYMBOL_GPL(iommu_detach_pasid_table);
+
 static void __iommu_detach_device(struct iommu_domain *domain,
  struct device *dev)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c56ce85..ab4d922 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -264,6 +264,8 @@ struct page_response_msg {
  * @sva_unbind: Unbind process address space from device
  * @sva_get_pasid: Get PASID associated to a SVA handle
  * @page_response: handle page request response
+ * @attach_pasid_table: attach a pasid table
+ * @detach_pasid_table: detach the pasid table
  * @pgsize_bitmap: bitmap of all possible supported page sizes
  */
 struct iommu_ops {
@@ -323,6 +325,9 @@ struct iommu_ops {
  void *drvdata);
void (*sva_unbind)(struct iommu_sva *handle);
int (*sva_get_pasid)(struct iommu_sva *handle);
+   int (*attach_pasid_table)(struct iommu_domain *domain,
+ struct iommu_pasid_table_config *cfg);
+   void (*detach_pasid_table)(struct iommu_domain *domain);
 
int (*page_response)(struct device *dev, struct page_response_msg *msg);
 
@@ -434,6 +439,9 @@ extern int iommu_attach_device(struct iommu_domain *domain,
   struct device *dev);
 extern void iommu_detach_device(struct iommu_domain *domain,
struct device *dev);
+extern int iommu_attach_pasid_table(struct iommu_domain *domain,
+   struct iommu_pasid_table_config *cfg);
+extern void iommu_detach_pasid_table(struct iommu_domain *domain);
 extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
 extern struct iommu_domain *iommu_get_dma_domain(struct device *dev);
 extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
@@ -943,6 +951,13 @@ iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
return -ENODEV;
 }
 
+static inline
+int iommu_attach_pasid_table(struct iommu_domain *domain,
+struct iommu_pasid_table_config *cfg)
+{
+   return -ENODEV;
+}
+
 static inline struct iommu_sva *
 iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata)
 {
@@ -964,6 +979,9 @@ static inline int iommu_sva_get_pasid(struct iommu_sva 

[PATCH v3 08/16] iommu/vt-d: Replace Intel specific PASID allocator with IOASID

2019-05-03 Thread Jacob Pan
Make use of generic IOASID code to manage PASID allocation,
free, and lookup. Replace Intel specific code.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel-iommu.c | 11 +--
 drivers/iommu/intel-pasid.c | 36 
 drivers/iommu/intel-svm.c   | 37 +
 3 files changed, 26 insertions(+), 58 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index fcc694a..64af526 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5155,7 +5155,7 @@ static void auxiliary_unlink_device(struct dmar_domain *domain,
domain->auxd_refcnt--;
 
if (!domain->auxd_refcnt && domain->default_pasid > 0)
-   intel_pasid_free_id(domain->default_pasid);
+   ioasid_free(domain->default_pasid);
 }
 
 static int aux_domain_add_dev(struct dmar_domain *domain,
@@ -5173,10 +5173,9 @@ static int aux_domain_add_dev(struct dmar_domain *domain,
if (domain->default_pasid <= 0) {
int pasid;
 
-   pasid = intel_pasid_alloc_id(domain, PASID_MIN,
-pci_max_pasids(to_pci_dev(dev)),
-GFP_KERNEL);
-   if (pasid <= 0) {
+   pasid = ioasid_alloc(NULL, PASID_MIN, pci_max_pasids(to_pci_dev(dev)) - 1,
+   domain);
+   if (pasid == INVALID_IOASID) {
pr_err("Can't allocate default pasid\n");
return -ENODEV;
}
@@ -5212,7 +5211,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain,
spin_unlock(&iommu->lock);
spin_unlock_irqrestore(&device_domain_lock, flags);
if (!domain->auxd_refcnt && domain->default_pasid > 0)
-   intel_pasid_free_id(domain->default_pasid);
+   ioasid_free(domain->default_pasid);
 
return ret;
 }
diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c
index 95f8f0c..2ce6ac2 100644
--- a/drivers/iommu/intel-pasid.c
+++ b/drivers/iommu/intel-pasid.c
@@ -26,42 +26,6 @@
  */
 static DEFINE_SPINLOCK(pasid_lock);
 u32 intel_pasid_max_id = PASID_MAX;
-static DEFINE_IDR(pasid_idr);
-
-int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp)
-{
-   int ret, min, max;
-
-   min = max_t(int, start, PASID_MIN);
-   max = min_t(int, end, intel_pasid_max_id);
-
-   WARN_ON(in_interrupt());
-   idr_preload(gfp);
-   spin_lock(&pasid_lock);
-   ret = idr_alloc(&pasid_idr, ptr, min, max, GFP_ATOMIC);
-   spin_unlock(&pasid_lock);
-   idr_preload_end();
-
-   return ret;
-}
-
-void intel_pasid_free_id(int pasid)
-{
-   spin_lock(&pasid_lock);
-   idr_remove(&pasid_idr, pasid);
-   spin_unlock(&pasid_lock);
-}
-
-void *intel_pasid_lookup_id(int pasid)
-{
-   void *p;
-
-   spin_lock(&pasid_lock);
-   p = idr_find(&pasid_idr, pasid);
-   spin_unlock(&pasid_lock);
-
-   return p;
-}
 
 int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid)
 {
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index f5d1e1e..8fff212 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "intel-pasid.h"
@@ -334,16 +335,15 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
if (pasid_max > intel_pasid_max_id)
pasid_max = intel_pasid_max_id;
 
-   /* Do not use PASID 0 in caching mode (virtualised IOMMU) */
-   ret = intel_pasid_alloc_id(svm,
-  !!cap_caching_mode(iommu->cap),
-  pasid_max - 1, GFP_KERNEL);
-   if (ret < 0) {
+   /* Do not use PASID 0, reserved for RID to PASID */
+   svm->pasid = ioasid_alloc(NULL, PASID_MIN,
+   pasid_max - 1, svm);
+   if (svm->pasid == INVALID_IOASID) {
kfree(svm);
kfree(sdev);
+   ret = -ENOSPC;
goto out;
}
-   svm->pasid = ret;
svm->notifier.ops = _mmuops;
svm->mm = mm;
svm->flags = flags;
@@ -353,7 +353,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
if (mm) {
ret = mmu_notifier_register(&svm->notifier, mm);
if (ret) {
-   intel_pasid_free_id(svm->pasid);
+   ioasid_free(svm->pasid);
kfree(svm);
kfree(sdev);
goto out;
@@ -369,7 +369,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
if (ret) 

[PATCH v3 04/16] ioasid: Add custom IOASID allocator

2019-05-03 Thread Jacob Pan
Sometimes, IOASID allocation must be handled by platform specific
code. The use cases are guest vIOMMU and pvIOMMU where IOASIDs need
to be allocated by the host via enlightened or paravirt interfaces.

This patch adds an extension to the IOASID allocator APIs such that
platform drivers can register a custom allocator, possibly at boot
time, to take over the allocation. Xarray is still used for tracking
and searching purposes internal to the IOASID code. Private data of
an IOASID can also be set after the allocation.

There can be multiple custom allocators registered but only one is
used at a time. In case of hot removal of the device that provides the
allocator, all IOASIDs must be freed prior to unregistering the
allocator. The default XArray based allocator cannot be mixed with
custom allocators, i.e. custom allocators will not be used if there
are outstanding IOASIDs allocated by the default XA allocator.
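
A sketch of how a platform driver might plug in; the callback bodies and
my_driver_data are placeholders, while the alloc/free/pdata members match the
usage shown in patch 06/16:

	static ioasid_t my_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
	{
		/* e.g. ask the host through an enlightened/paravirt interface */
		return INVALID_IOASID;	/* placeholder */
	}

	static void my_ioasid_free(ioasid_t ioasid, void *data)
	{
		/* tell the host to release the ID */
	}

	static struct ioasid_allocator my_allocator = {
		.alloc = my_ioasid_alloc,
		.free  = my_ioasid_free,
	};

	/* at probe time */
	my_allocator.pdata = my_driver_data;
	ioasid_register_allocator(&my_allocator);

	/* at remove time, after all of its IOASIDs have been freed */
	ioasid_unregister_allocator(&my_allocator);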

Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 125 +
 1 file changed, 125 insertions(+)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 99f5e0a..ed2915a 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -17,6 +17,100 @@ struct ioasid_data {
 };
 
 static DEFINE_XARRAY_ALLOC(ioasid_xa);
+static DEFINE_MUTEX(ioasid_allocator_lock);
+static struct ioasid_allocator *active_custom_allocator;
+
+static LIST_HEAD(custom_allocators);
+/*
+ * A flag to track if the ioasid default allocator is in use, this will
+ * prevent custom allocators from being used. The reason is that a custom
+ * allocator must have unadulterated space to track private data with the
+ * xarray; there cannot be a mix between default and custom allocated IOASIDs.
+ */
+static int default_allocator_active;
+
+/**
+ * ioasid_register_allocator - register a custom allocator
+ * @allocator: the custom allocator to be registered
+ *
+ * Custom allocators take precedence over the default xarray based allocator.
+ * Private data associated with the ASID are managed by ASID common code
+ * similar to data stored in xa.
+ *
+ * There can be multiple allocators registered but only one is active. In case
+ * of runtime removal of a custom allocator, the next one is activated based
+ * on the registration ordering.
+ */
+int ioasid_register_allocator(struct ioasid_allocator *allocator)
+{
+   struct ioasid_allocator *pallocator;
+   int ret = 0;
+
+   if (!allocator)
+   return -EINVAL;
+
+   mutex_lock(&ioasid_allocator_lock);
+   /*
+* No particular preference since all custom allocators end up calling
+* the host to allocate IOASIDs. We activate the first one and keep
+* the later registered allocators in a list in case the first one gets
+* removed due to hotplug.
+*/
+   if (list_empty(&custom_allocators))
+   active_custom_allocator = allocator;
+   else {
+   /* Check if the allocator is already registered */
+   list_for_each_entry(pallocator, &custom_allocators, list) {
+   if (pallocator == allocator) {
+   pr_err("IOASID allocator already registered\n");
+   ret = -EEXIST;
+   goto out_unlock;
+   }
+   }
+   }
+   list_add_tail(&allocator->list, &custom_allocators);
+
+out_unlock:
+   mutex_unlock(&ioasid_allocator_lock);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_register_allocator);
+
+/**
+ * ioasid_unregister_allocator - Remove a custom IOASID allocator
+ * @allocator: the custom allocator to be removed
+ *
+ * Remove an allocator from the list, activate the next allocator in
+ * the order it was registered.
+ */
+void ioasid_unregister_allocator(struct ioasid_allocator *allocator)
+{
+   if (!allocator)
+   return;
+
+   if (list_empty(&custom_allocators)) {
+   pr_warn("No custom IOASID allocators active!\n");
+   return;
+   }
+
+   mutex_lock(&ioasid_allocator_lock);
+   list_del(&allocator->list);
+   if (list_empty(&custom_allocators)) {
+   pr_info("No custom IOASID allocators\n");
+   /*
+* All IOASIDs should have been freed before the last custom
+* allocator is unregistered. Unless default allocator is in
+* use.
+*/
+   BUG_ON(!xa_empty(&ioasid_xa) && !default_allocator_active);
+   active_custom_allocator = NULL;
+   } else if (allocator == active_custom_allocator) {
+   active_custom_allocator = list_first_entry(&custom_allocators,
+   struct ioasid_allocator, list);
+   pr_info("IOASID allocator changed");
+   }
+   mutex_unlock(&ioasid_allocator_lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_unregister_allocator);
 
 /**
  * ioasid_set_data - Set private data for an allocated ioasid
@@ -68,6 +162,29 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,

[PATCH v3 03/16] iommu: Add I/O ASID allocator

2019-05-03 Thread Jacob Pan
From: Jean-Philippe Brucker 

Some devices might support multiple DMA address spaces, in particular
those that have the PCI PASID feature. PASID (Process Address Space ID)
allows sharing process address spaces with devices (SVA), partitioning a
device into VM-assignable entities (VFIO mdev) or simply providing
multiple DMA address spaces to kernel drivers. Add a global PASID
allocator usable by different drivers at the same time. Name it I/O ASID
to avoid confusion with ASIDs allocated by arch code, which are usually
a separate ID space.

The IOASID space is global. Each device can have its own PASID space,
but by convention the IOMMU ended up having a global PASID space, so
that with SVA, each mm_struct is associated to a single PASID.

The allocator is primarily used by the IOMMU subsystem, but on rare
occasions drivers would like to allocate PASIDs for devices that aren't
managed by an IOMMU, using the same ID space as the IOMMU.
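
A minimal usage sketch (the set declaration, range and private pointer are
illustrative; DECLARE_IOASID_SET is only referenced in the file header comment
below, so its exact form is assumed):

	DECLARE_IOASID_SET(my_ioasid_set);	/* assumed macro form */
	void *priv, *my_private = NULL;		/* caller-private data, placeholder */
	ioasid_t id;

	id = ioasid_alloc(&my_ioasid_set, 1, (1 << 20) - 1, my_private);
	if (id == INVALID_IOASID)
		return -ENOSPC;

	priv = ioasid_find(&my_ioasid_set, id, NULL);	/* retrieves my_private */
	ioasid_free(id);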

Signed-off-by: Jean-Philippe Brucker 
Signed-off-by: Jacob Pan 
Link: https://lkml.org/lkml/2019/4/26/462
---
 drivers/iommu/Kconfig  |   6 +++
 drivers/iommu/Makefile |   1 +
 drivers/iommu/ioasid.c | 140 +
 include/linux/ioasid.h |  67 +++
 4 files changed, 214 insertions(+)
 create mode 100644 drivers/iommu/ioasid.c
 create mode 100644 include/linux/ioasid.h

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 6f07f3b..75e7f97 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -2,6 +2,12 @@
 config IOMMU_IOVA
tristate
 
+config IOASID
+   bool
+   help
+ Enable the I/O Address Space ID allocator. A single ID space shared
+ between different users.
+
 # IOMMU_API always gets selected by whoever wants it.
 config IOMMU_API
bool
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 8c71a15..0efac6f 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
+obj-$(CONFIG_IOASID) += ioasid.o
 obj-$(CONFIG_IOMMU_IOVA) += iova.o
 obj-$(CONFIG_OF_IOMMU) += of_iommu.o
 obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o
diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
new file mode 100644
index 000..99f5e0a
--- /dev/null
+++ b/drivers/iommu/ioasid.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * I/O Address Space ID allocator. There is one global IOASID space, split into
+ * subsets. Users create a subset with DECLARE_IOASID_SET, then allocate and
+ * free IOASIDs with ioasid_alloc and ioasid_free.
+ */
+#include 
+#include 
+#include 
+#include 
+
+struct ioasid_data {
+   ioasid_t id;
+   struct ioasid_set *set;
+   void *private;
+   struct rcu_head rcu;
+};
+
+static DEFINE_XARRAY_ALLOC(ioasid_xa);
+
+/**
+ * ioasid_set_data - Set private data for an allocated ioasid
+ * @ioasid: the ID to set data
+ * @data:   the private data
+ *
+ * For IOASID that is already allocated, private data can be set
+ * via this API. Future lookup can be done via ioasid_find.
+ */
+int ioasid_set_data(ioasid_t ioasid, void *data)
+{
+   struct ioasid_data *ioasid_data;
+   int ret = 0;
+
+   ioasid_data = xa_load(&ioasid_xa, ioasid);
+   if (ioasid_data)
+   ioasid_data->private = data;
+   else
+   ret = -ENOENT;
+
+   /* getter may use the private data */
+   synchronize_rcu();
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_set_data);
+
+/**
+ * ioasid_alloc - Allocate an IOASID
+ * @set: the IOASID set
+ * @min: the minimum ID (inclusive)
+ * @max: the maximum ID (inclusive)
+ * @private: data private to the caller
+ *
+ * Allocate an ID between @min and @max (or %0 and %INT_MAX). Return the
+ * allocated ID on success, or INVALID_IOASID on failure. The @private pointer
+ * is stored internally and can be retrieved with ioasid_find().
+ */
+ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
+ void *private)
+{
+   int id = INVALID_IOASID;
+   struct ioasid_data *data;
+
+   data = kzalloc(sizeof(*data), GFP_KERNEL);
+   if (!data)
+   return INVALID_IOASID;
+
+   data->set = set;
+   data->private = private;
+
+   if (xa_alloc(&ioasid_xa, &id, data, XA_LIMIT(min, max), GFP_KERNEL)) {
+   pr_err("Failed to alloc ioasid from %d to %d\n", min, max);
+   goto exit_free;
+   }
+   data->id = id;
+
+exit_free:
+   if (id < 0 || id == INVALID_IOASID) {
+   kfree(data);
+   return INVALID_IOASID;
+   }
+   return id;
+}
+EXPORT_SYMBOL_GPL(ioasid_alloc);
+
+/**
+ * ioasid_free - Free an IOASID
+ * @ioasid: the ID to remove
+ */
+void ioasid_free(ioasid_t ioasid)
+{
+   struct ioasid_data *ioasid_data;
+
+   

[PATCH v3 05/16] iommu/vt-d: Enlightened PASID allocation

2019-05-03 Thread Jacob Pan
From: Lu Baolu 

If the Intel IOMMU runs in caching mode, a.k.a. virtual IOMMU, the
IOMMU driver should rely on the emulation software to allocate
and free PASIDs. The Intel VT-d spec revision 3.0 defines a
register set to support this: a capability register, a virtual
command register and a virtual response register. Refer to
sections 10.4.42, 10.4.43 and 10.4.44 for more information.

This patch adds the enlightened PASID allocation/free interfaces
via the virtual command register.
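
For illustration only (not part of this patch): a caller in the IOMMU
driver could use the two new helpers roughly as sketched below. The
wrapper function names and the surrounding logic are assumptions made
for the example; only vcmd_alloc_pasid()/vcmd_free_pasid() and their
return codes come from this patch.

	static int example_get_guest_pasid(struct intel_iommu *iommu,
					   unsigned int *pasid)
	{
		int ret;

		/*
		 * Ask the emulation software for a PASID through the
		 * virtual command register: -ENODEV if the virtual
		 * command capability is missing, -ENOMEM if no PASID
		 * is available.
		 */
		ret = vcmd_alloc_pasid(iommu, pasid);
		if (ret)
			return ret;

		/* ... program PASID table entries using *pasid ... */

		return 0;
	}

	static void example_put_guest_pasid(struct intel_iommu *iommu,
					    unsigned int pasid)
	{
		/* ... tear down PASID table entries ... */

		vcmd_free_pasid(iommu, pasid);
	}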

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel-pasid.c | 76 +
 drivers/iommu/intel-pasid.h | 13 +++-
 include/linux/intel-iommu.h |  2 ++
 3 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c
index 03b12d2..95f8f0c 100644
--- a/drivers/iommu/intel-pasid.c
+++ b/drivers/iommu/intel-pasid.c
@@ -63,6 +63,82 @@ void *intel_pasid_lookup_id(int pasid)
return p;
 }
 
+int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid)
+{
+   u64 res;
+   u64 cap;
+   u8 status_code;
+   unsigned long flags;
+   int ret = 0;
+
+   if (!ecap_vcs(iommu->ecap)) {
+   pr_warn("IOMMU: %s: Hardware doesn't support virtual command\n",
+   iommu->name);
+   return -ENODEV;
+   }
+
+   cap = dmar_readq(iommu->reg + DMAR_VCCAP_REG);
+   if (!(cap & DMA_VCS_PAS)) {
+   pr_warn("IOMMU: %s: Emulation software doesn't support PASID allocation\n",
+   iommu->name);
+   return -ENODEV;
+   }
+
+   raw_spin_lock_irqsave(&iommu->register_lock, flags);
+   dmar_writeq(iommu->reg + DMAR_VCMD_REG, VCMD_CMD_ALLOC);
+   IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq,
+ !(res & VCMD_VRSP_IP), res);
+   raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+   status_code = VCMD_VRSP_SC(res);
+   switch (status_code) {
+   case VCMD_VRSP_SC_SUCCESS:
+   *pasid = VCMD_VRSP_RESULT(res);
+   break;
+   case VCMD_VRSP_SC_NO_PASID_AVAIL:
+   pr_info("IOMMU: %s: No PASID available\n", iommu->name);
+   ret = -ENOMEM;
+   break;
+   default:
+   ret = -ENODEV;
+   pr_warn("IOMMU: %s: Unknown error code %d\n",
+   iommu->name, status_code);
+   }
+
+   return ret;
+}
+
+void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid)
+{
+   u64 res;
+   u8 status_code;
+   unsigned long flags;
+
+   if (!ecap_vcs(iommu->ecap)) {
+   pr_warn("IOMMU: %s: Hardware doesn't support virtual command\n",
+   iommu->name);
+   return;
+   }
+
+   raw_spin_lock_irqsave(&iommu->register_lock, flags);
+   dmar_writeq(iommu->reg + DMAR_VCMD_REG, (pasid << 8) | VCMD_CMD_FREE);
+   IOMMU_WAIT_OP(iommu, DMAR_VCRSP_REG, dmar_readq,
+ !(res & VCMD_VRSP_IP), res);
+   raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
+
+   status_code = VCMD_VRSP_SC(res);
+   switch (status_code) {
+   case VCMD_VRSP_SC_SUCCESS:
+   break;
+   case VCMD_VRSP_SC_INVALID_PASID:
+   pr_info("IOMMU: %s: Invalid PASID\n", iommu->name);
+   break;
+   default:
+   pr_warn("IOMMU: %s: Unknown error code %d\n",
+   iommu->name, status_code);
+   }
+}
+
 /*
  * Per device pasid table management:
  */
diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h
index 23537b3..4b26ab5 100644
--- a/drivers/iommu/intel-pasid.h
+++ b/drivers/iommu/intel-pasid.h
@@ -19,6 +19,16 @@
 #define PASID_PDE_SHIFT 6
 #define MAX_NR_PASID_BITS  20
 
+/* Virtual command interface for enlightened pasid management. */
+#define VCMD_CMD_ALLOC 0x1
+#define VCMD_CMD_FREE  0x2
+#define VCMD_VRSP_IP   0x1
+#define VCMD_VRSP_SC(e)(((e) >> 1) & 0x3)
+#define VCMD_VRSP_SC_SUCCESS   0
+#define VCMD_VRSP_SC_NO_PASID_AVAIL1
+#define VCMD_VRSP_SC_INVALID_PASID 1
+#define VCMD_VRSP_RESULT(e)(((e) >> 8) & 0xf)
+
 /*
  * Domain ID reserved for pasid entries programmed for first-level
  * only and pass-through transfer modes.
@@ -69,5 +79,6 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
   struct device *dev, int pasid);
 void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
 struct device *dev, int pasid);
-
+int vcmd_alloc_pasid(struct intel_iommu *iommu, unsigned int *pasid);
+void vcmd_free_pasid(struct intel_iommu *iommu, unsigned int pasid);
 #endif /* __INTEL_PASID_H */
diff --git 

[PATCH v3 00/16] Shared virtual address IOMMU and VT-d support

2019-05-03 Thread Jacob Pan
Shared virtual address (SVA), a.k.a. shared virtual memory (SVM) on Intel
platforms, allows address space sharing between device DMA and applications.
SVA can reduce programming complexity and enhance security.
This series is intended to enable SVA virtualization, i.e. sharing a guest
application address space with physical device DMA. Only the IOMMU portion
of the changes is included in this series; additional support is needed in
VFIO and QEMU (to be submitted separately) to complete this functionality.

To keep the changes incremental and the patchset size manageable, this
series does not include support for page request services.

In the VT-d implementation, the PASID table is per device and maintained in
the host. The guest PASID table is shadowed in the VMM where the virtual
IOMMU is emulated.

    .-------------.  .---------------------------.
    |   vIOMMU    |  | Guest process CR3, FL only|
    |             |  '---------------------------'
    .----------------/
    | PASID Entry |--- PASID cache flush -
    '-------------'                       |
    |             |                       V
    |             |                CR3 in GPA
    '-------------'
Guest
------| Shadow |--------------------------|--------
      v        v                          v
Host
    .-------------.  .----------------------.
    |   pIOMMU    |  | Bind FL for GVA-GPA  |
    |             |  '----------------------'
    .----------------/  |
    | PASID Entry |     V (Nested xlate)
    '----------------\.------------------------------.
    |             |   |SL for GPA-HPA, default domain|
    |             |   '------------------------------'
    '-------------'
Where:
 - FL = First level/stage one page tables
 - SL = Second level/stage two page tables


This work is based on collaboration with other developers on the IOMMU
mailing list. Notably,

[1] [PATCH v6 00/22] SMMUv3 Nested Stage Setup by Eric Auger
https://lkml.org/lkml/2019/3/17/124

[2] [RFC PATCH 2/6] drivers core: Add I/O ASID allocator by Jean-Philippe
Brucker
https://www.spinics.net/lists/iommu/msg30639.html

[3] [RFC PATCH 0/5] iommu: APIs for paravirtual PASID allocation by Lu Baolu
https://lkml.org/lkml/2018/11/12/1921

[4] [PATCH v5 00/23] IOMMU and VT-d driver support for Shared Virtual
Address (SVA)
https://lwn.net/Articles/754331/

There are roughly three parts:
1. Generic PASID allocator [1] with extension to support custom allocator
   (a brief usage sketch follows this list)
2. IOMMU cache invalidation passdown from guest to host
3. Guest PASID bind for nested translation
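
For part 1, a rough usage sketch of the generic IOASID allocator (the call
signatures follow the allocator patch earlier in this series; the exact
DECLARE_IOASID_SET() syntax is defined by include/linux/ioasid.h from that
patch, and the set name, ID range and private pointer below are made-up
examples):

	/* One IOASID set for this user (illustrative name). */
	DECLARE_IOASID_SET(example_set);

	ioasid_t pasid;

	/* Allocate a 20-bit PASID and attach caller-private data to it. */
	pasid = ioasid_alloc(&example_set, 1, (1 << 20) - 1, my_private_data);
	if (pasid == INVALID_IOASID)
		return -ENOMEM;

	/* ... the private data can later be retrieved via ioasid_find() ... */

	ioasid_free(pasid);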

All generic IOMMU APIs are reused from [1], which has a v7 just published
with no real impact on the patches used here. It is worth noting that,
unlike the SMMU nested stage setup where the PASID table is owned by the
guest, the VT-d PASID table is owned by the host, so individual PASIDs are
bound instead of the whole PASID table.

This series is based on the new VT-d 3.0 specification
(https://software.intel.com/sites/default/files/managed/c5/15/vt-directed-io-spec.pdf).
It differs from the older series in [4], which was based on the previous
specification revision that does not have scalable mode.


ChangeLog:
- V3
  - Addressed thorough review comments from Eric Auger (Thank you!)
  - Moved IOASID allocator from driver core to IOMMU code per
suggestion by Christoph Hellwig
(https://lkml.org/lkml/2019/4/26/462)
  - Rebased on top of Jean's SVA API branch and Eric's v7[1]
(git://linux-arm.org/linux-jpb.git sva/api)
  - All IOMMU APIs are unmodified (except the new bind guest PASID
call in patch 9/16)

- V2
  - Rebased on Joerg's IOMMU x86/vt-d branch v5.1-rc4
  - Integrated with Eric Auger's new v7 series for common APIs
  (https://github.com/eauger/linux/tree/v5.1-rc3-2stage-v7)
  - Addressed review comments from Andy Shevchenko and Alex Williamson on
    IOASID custom allocator.
  - Support multiple custom IOASID allocators (vIOMMUs) and dynamic
registration.


Jacob Pan (13):
  iommu: Introduce attach/detach_pasid_table API
  ioasid: Add custom IOASID allocator
  iommu/vt-d: Add custom allocator for IOASID
  iommu/vtd: Optimize tlb invalidation for vIOMMU
  iommu/vt-d: Replace Intel specific PASID allocator with IOASID
  iommu: Introduce guest PASID bind function
  iommu/vt-d: Move domain helper to header
  iommu/vt-d: Avoid duplicated code for PASID setup
  iommu/vt-d: Add nested translation helper function
  iommu/vt-d: Clean up for SVM device list
  iommu/vt-d: Add bind guest PASID support
  iommu/vt-d: Support flushing more translation cache types
  iommu/vt-d: Add svm/sva invalidate function

Jean-Philippe Brucker (1):
  iommu: Add I/O ASID allocator

Liu, Yi L (1):
  iommu: Introduce cache_invalidate API

Lu Baolu (1):
  iommu/vt-d: Enlightened PASID allocation

 drivers/iommu/Kconfig   |   7 ++
 drivers/iommu/Makefile  |   1 +
 drivers/iommu/dmar.c|  50 
 

Re: "iommu/amd: Set exclusion range correctly" causes smartpqi offline

2019-05-03 Thread Qian Cai
On Mon, 2019-04-29 at 16:23 +0200, Joerg Roedel wrote:
> On Fri, Apr 26, 2019 at 11:55:12AM -0400, Qian Cai wrote:
> > https://git.sr.ht/~cai/linux-debug/blob/master/dmesg
> 
> Thanks, I can't see any definitions for unity ranges or exclusion ranges
> in the IVRS table dump, which makes it even more weird.
> 
> Can you please send me the output of
> 
>   for f in `ls -1 /sys/kernel/iommu_groups/*/reserved_regions`; do echo "---$f"; cat $f;done
> 
> to double-check?

https://git.sr.ht/~cai/linux-debug/blob/master/iommu


Re: [PATCH v6 0/3] PCIe Host request to reserve IOVA

2019-05-03 Thread Srinath Mannam via iommu
Hi Lorenzo,

Thanks a lot.

Regards,
Srinath.

On Fri, May 3, 2019 at 9:23 PM Lorenzo Pieralisi
 wrote:
>
> On Fri, May 03, 2019 at 07:35:31PM +0530, Srinath Mannam wrote:
> > This patch set will reserve IOVA addresses for DMA memory holes.
> >
> > The IPROC host controller allows only a few ranges of physical address
> > as inbound PCI addresses which are listed through dma-ranges DT property.
> > Added dma_ranges list field of PCI host bridge structure to hold these
> > allowed inbound address ranges in sorted order.
> >
> > Process this list and reserve IOVA addresses that are not present in its
> > resource entries (ie DMA memory holes) to prevent allocating IOVA
> > addresses that cannot be allocated as inbound addresses.
> >
> > This patch set is based on Linux-5.1-rc3.
> >
> > Changes from v5:
> >   - Addressed Robin Murphy, Lorenzo review comments.
> > - Error handling in dma ranges list processing.
> > - Used commit messages given by Lorenzo to all patches.
> >
> > Changes from v4:
> >   - Addressed Bjorn, Robin Murphy and Auger Eric review comments.
> > - Commit message modification.
> > - Change DMA_BIT_MASK to "~(dma_addr_t)0".
> >
> > Changes from v3:
> >   - Addressed Robin Murphy review comments.
> > - pcie-iproc: parse dma-ranges and make sorted resource list.
> > - dma-iommu: process list and reserve gaps between entries
> >
> > Changes from v2:
> >   - Patch set rebased to Linux-5.0-rc2
> >
> > Changes from v1:
> >   - Addressed Oza review comments.
> >
> > Srinath Mannam (3):
> >   PCI: Add dma_ranges window list
> >   iommu/dma: Reserve IOVA for PCIe inaccessible DMA address
> >   PCI: iproc: Add sorted dma ranges resource entries to host bridge
> >
> >  drivers/iommu/dma-iommu.c   | 35 ++---
> >  drivers/pci/controller/pcie-iproc.c | 44 
> > -
> >  drivers/pci/probe.c |  3 +++
> >  include/linux/pci.h |  1 +
> >  4 files changed, 79 insertions(+), 4 deletions(-)
>
> I have applied the series to pci/iova-dma-ranges, targeting v5.2,
> thanks.
>
> Lorenzo


Re: [PATCH v6 0/3] PCIe Host request to reserve IOVA

2019-05-03 Thread Lorenzo Pieralisi
On Fri, May 03, 2019 at 07:35:31PM +0530, Srinath Mannam wrote:
> This patch set will reserve IOVA addresses for DMA memory holes.
> 
> The IPROC host controller allows only a few ranges of physical address
> as inbound PCI addresses which are listed through dma-ranges DT property.
> Added dma_ranges list field of PCI host bridge structure to hold these
> allowed inbound address ranges in sorted order.
> 
> Process this list and reserve IOVA addresses that are not present in its
> resource entries (ie DMA memory holes) to prevent allocating IOVA
> addresses that cannot be allocated as inbound addresses.
> 
> This patch set is based on Linux-5.1-rc3.
> 
> Changes from v5:
>   - Addressed Robin Murphy, Lorenzo review comments.
> - Error handling in dma ranges list processing.
> - Used commit messages given by Lorenzo to all patches.
> 
> Changes from v4:
>   - Addressed Bjorn, Robin Murphy and Auger Eric review comments.
> - Commit message modification.
> - Change DMA_BIT_MASK to "~(dma_addr_t)0".
> 
> Changes from v3:
>   - Addressed Robin Murphy review comments.
> - pcie-iproc: parse dma-ranges and make sorted resource list.
> - dma-iommu: process list and reserve gaps between entries
> 
> Changes from v2:
>   - Patch set rebased to Linux-5.0-rc2
> 
> Changes from v1:
>   - Addressed Oza review comments.
> 
> Srinath Mannam (3):
>   PCI: Add dma_ranges window list
>   iommu/dma: Reserve IOVA for PCIe inaccessible DMA address
>   PCI: iproc: Add sorted dma ranges resource entries to host bridge
> 
>  drivers/iommu/dma-iommu.c   | 35 ++---
>  drivers/pci/controller/pcie-iproc.c | 44 
> -
>  drivers/pci/probe.c |  3 +++
>  include/linux/pci.h |  1 +
>  4 files changed, 79 insertions(+), 4 deletions(-)

I have applied the series to pci/iova-dma-ranges, targeting v5.2,
thanks.

Lorenzo


Re: Linux crash when using FTDI FT232R USB to serial converter on AMD boards.

2019-05-03 Thread Joerg Roedel
On Mon, Apr 29, 2019 at 11:48:47AM +0200, Johan Hovold wrote:
> So this is a debian 4.18 kernel seemingly crashing due to a xhci or
> iommu issue.
> 
> Can you reproduce this on a mainline kernel?
> 
> If so, please post the corresponding logs to the lists and CC the xhci
> and iommu maintainers (added to CC).

Your kernel is probably missing this upstream fix:

4e50ce03976f iommu/amd: fix sg->dma_address for sg->offset bigger than 
PAGE_SIZE

Regards,

Joerg



Re: [PATCH] iommu/vt-d: Fix leak in intel_pasid_alloc_table on error path

2019-05-03 Thread Joerg Roedel
On Tue, Apr 30, 2019 at 09:30:04AM +0200, Eric Auger wrote:
> If alloc_pages_node() fails, pasid_table is leaked. Free it.
> 
> Fixes: cc580e41260db ("iommu/vt-d: Per PCI device pasid table interfaces")
> 
> Signed-off-by: Eric Auger 
> ---
>  drivers/iommu/intel-pasid.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)

Applied, thanks.



Re: [PATCH 0/2] iommu/vt-d: Small fixes for 5.2-rc1

2019-05-03 Thread Joerg Roedel
On Thu, May 02, 2019 at 09:34:24AM +0800, Lu Baolu wrote:
> Lu Baolu (2):
>   iommu/vt-d: Set intel_iommu_gfx_mapped correctly
>   iommu/vt-d: Make kernel parameter igfx_off work with vIOMMU

Applied both, thanks.


Re: [PATCH v2] iommu/amd: flush not present cache in iommu_map_page

2019-05-03 Thread Joerg Roedel
On Mon, Apr 29, 2019 at 12:47:02AM +0100, Tom Murphy wrote:
> check if there is a not-present cache present and flush it if there is.
> 
> Signed-off-by: Tom Murphy 
> ---
>  drivers/iommu/amd_iommu.c | 19 +++
>  1 file changed, 15 insertions(+), 4 deletions(-)

Applied, thanks.



Re: [PATCH 1/1] iommu/vt-d: Cleanup: no spaces at the start of a line

2019-05-03 Thread Joerg Roedel
On Mon, Apr 29, 2019 at 09:16:02AM +0800, Lu Baolu wrote:
> Replace the whitespaces at the start of a line with tabs. No
> functional changes.
> 
> Signed-off-by: Lu Baolu 
> ---
>  drivers/iommu/intel-iommu.c | 53 +++--
>  1 file changed, 27 insertions(+), 26 deletions(-)

Applied, thanks.



Re: [PATCH v3 6/7] irqchip/gic-v3-mbi: Don't map the MSI page in mbi_compose_m{b, s}i_msg()

2019-05-03 Thread Robin Murphy

On 03/05/2019 15:27, Marc Zyngier wrote:

On 01/05/2019 14:58, Julien Grall wrote:

The functions mbi_compose_m{b, s}i_msg may be called from non-preemptible
context. However, on RT, iommu_dma_map_msi_msg() requires to be called
from a preemptible context.

A recent patch split iommu_dma_map_msi_msg in two new functions:
one that should be called in preemptible context, the other does
not have any requirement.

The GICv3 MSI driver is reworked to avoid executing preemptible code in
non-preemptible context. This can be achieved by preparing the two MSI
mappings when allocating the MSI interrupt.

Signed-off-by: Julien Grall 

---
 Changes in v2:
 - Rework the commit message to use imperative mood
---
  drivers/irqchip/irq-gic-v3-mbi.c | 15 +--
  1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3-mbi.c b/drivers/irqchip/irq-gic-v3-mbi.c
index fbfa7ff6deb1..d50f6cdf043c 100644
--- a/drivers/irqchip/irq-gic-v3-mbi.c
+++ b/drivers/irqchip/irq-gic-v3-mbi.c
@@ -84,6 +84,7 @@ static void mbi_free_msi(struct mbi_range *mbi, unsigned int 
hwirq,
  static int mbi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
   unsigned int nr_irqs, void *args)
  {
+   msi_alloc_info_t *info = args;
struct mbi_range *mbi = NULL;
int hwirq, offset, i, err = 0;
  
@@ -104,6 +105,16 @@ static int mbi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
  
  	hwirq = mbi->spi_start + offset;
  
+	err = iommu_dma_prepare_msi(info->desc,

+   mbi_phys_base + GICD_CLRSPI_NSR);
+   if (err)
+   return err;
+
+   err = iommu_dma_prepare_msi(info->desc,
+   mbi_phys_base + GICD_SETSPI_NSR);
+   if (err)
+   return err;


Nit here: The two registers are guaranteed to be in the same 4k page
(respectively offsets 0x48 and 0x40), so the second call is at best
useless. I'll fix it when applying it, no need to resend.


Ack; the second call will simply find the existing page and overwrite 
desc->iommu_cookie with the same value. And if the two calls *did* ever 
come back with different pages, that logic would be quietly broken anyway.


Robin.


Re: [PATCH v3 6/7] irqchip/gic-v3-mbi: Don't map the MSI page in mbi_compose_m{b, s}i_msg()

2019-05-03 Thread Marc Zyngier
On 01/05/2019 14:58, Julien Grall wrote:
> The functions mbi_compose_m{b, s}i_msg may be called from non-preemptible
> context. However, on RT, iommu_dma_map_msi_msg() requires to be called
> from a preemptible context.
> 
> A recent patch split iommu_dma_map_msi_msg in two new functions:
> one that should be called in preemptible context, the other does
> not have any requirement.
> 
> The GICv3 MSI driver is reworked to avoid executing preemptible code in
> non-preemptible context. This can be achieved by preparing the two MSI
> mappings when allocating the MSI interrupt.
> 
> Signed-off-by: Julien Grall 
> 
> ---
> Changes in v2:
> - Rework the commit message to use imperative mood
> ---
>  drivers/irqchip/irq-gic-v3-mbi.c | 15 +--
>  1 file changed, 13 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/irqchip/irq-gic-v3-mbi.c 
> b/drivers/irqchip/irq-gic-v3-mbi.c
> index fbfa7ff6deb1..d50f6cdf043c 100644
> --- a/drivers/irqchip/irq-gic-v3-mbi.c
> +++ b/drivers/irqchip/irq-gic-v3-mbi.c
> @@ -84,6 +84,7 @@ static void mbi_free_msi(struct mbi_range *mbi, unsigned 
> int hwirq,
>  static int mbi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
>  unsigned int nr_irqs, void *args)
>  {
> + msi_alloc_info_t *info = args;
>   struct mbi_range *mbi = NULL;
>   int hwirq, offset, i, err = 0;
>  
> @@ -104,6 +105,16 @@ static int mbi_irq_domain_alloc(struct irq_domain 
> *domain, unsigned int virq,
>  
>   hwirq = mbi->spi_start + offset;
>  
> + err = iommu_dma_prepare_msi(info->desc,
> + mbi_phys_base + GICD_CLRSPI_NSR);
> + if (err)
> + return err;
> +
> + err = iommu_dma_prepare_msi(info->desc,
> + mbi_phys_base + GICD_SETSPI_NSR);
> + if (err)
> + return err;

Nit here: The two registers are guaranteed to be in the same 4k page
(respectively offsets 0x48 and 0x40), so the second call is at best
useless. I'll fix it when applying it, no need to resend.

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...


[PATCH v6 2/3] iommu/dma: Reserve IOVA for PCIe inaccessible DMA address

2019-05-03 Thread Srinath Mannam via iommu
The dma_ranges list field of the PCI host bridge structure has resource
entries, in sorted order, representing the address ranges allowed for DMA
transfers.

Process the list and reserve IOVA addresses that are not present in its
resource entries (i.e. DMA memory holes) to prevent allocating IOVA
addresses that cannot be accessed by PCI devices.
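
As an illustration (the numbers are made up, not from the patch): if the
host bridge advertises a single allowed range 0x8000_0000..0xffff_ffff,
the loop below reserves the hole below it (0x0..0x8000_0000) and, once the
last entry has been processed, the hole above it
(0x1_0000_0000..~(dma_addr_t)0), so the IOVA allocator can only hand out
addresses that fall inside the allowed window.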

Based-on-patch-by: Oza Pawandeep 
Signed-off-by: Srinath Mannam 
Signed-off-by: Lorenzo Pieralisi 
Reviewed-by: Oza Pawandeep 
Acked-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 35 ---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 77aabe6..954ae11 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -206,12 +206,13 @@ static int cookie_init_hw_msi_region(struct iommu_dma_cookie *cookie,
return 0;
 }
 
-static void iova_reserve_pci_windows(struct pci_dev *dev,
+static int iova_reserve_pci_windows(struct pci_dev *dev,
struct iova_domain *iovad)
 {
struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
struct resource_entry *window;
unsigned long lo, hi;
+   phys_addr_t start = 0, end;
 
	resource_list_for_each_entry(window, &bridge->windows) {
if (resource_type(window->res) != IORESOURCE_MEM)
@@ -221,6 +222,31 @@ static void iova_reserve_pci_windows(struct pci_dev *dev,
hi = iova_pfn(iovad, window->res->end - window->offset);
reserve_iova(iovad, lo, hi);
}
+
+   /* Get reserved DMA windows from host bridge */
+   resource_list_for_each_entry(window, &bridge->dma_ranges) {
+   end = window->res->start - window->offset;
+resv_iova:
+   if (end > start) {
+   lo = iova_pfn(iovad, start);
+   hi = iova_pfn(iovad, end);
+   reserve_iova(iovad, lo, hi);
+   } else {
+   /* dma_ranges list should be sorted */
+   dev_err(&dev->dev, "Failed to reserve IOVA\n");
+   return -EINVAL;
+   }
+
+   start = window->res->end - window->offset + 1;
+   /* If window is last entry */
+   if (window->node.next == &bridge->dma_ranges &&
+   end != ~(dma_addr_t)0) {
+   end = ~(dma_addr_t)0;
+   goto resv_iova;
+   }
+   }
+
+   return 0;
 }
 
 static int iova_reserve_iommu_regions(struct device *dev,
@@ -232,8 +258,11 @@ static int iova_reserve_iommu_regions(struct device *dev,
LIST_HEAD(resv_regions);
int ret = 0;
 
-   if (dev_is_pci(dev))
-   iova_reserve_pci_windows(to_pci_dev(dev), iovad);
+   if (dev_is_pci(dev)) {
+   ret = iova_reserve_pci_windows(to_pci_dev(dev), iovad);
+   if (ret)
+   return ret;
+   }
 
	iommu_get_resv_regions(dev, &resv_regions);
	list_for_each_entry(region, &resv_regions, list) {
-- 
2.7.4



[PATCH v6 3/3] PCI: iproc: Add sorted dma ranges resource entries to host bridge

2019-05-03 Thread Srinath Mannam via iommu
The IPROC host controller allows only a subset of the physical address
space as the target of inbound PCI memory transactions.

PCIe devices memory transactions targeting memory regions that
are not allowed for inbound transactions in the host controller
are rejected by the host controller and cannot reach the upstream
buses.

Firmware device tree description defines the DMA ranges that are
addressable by device DMA transactions; parse the device tree
dma-ranges property and add its ranges to the PCI host bridge dma_ranges
list; the iova_reserve_pci_windows() call in the driver will reserve the
IOVA address ranges that are not addressable (i.e. memory holes in the
dma-ranges set) so that they are not allocated to PCI devices for DMA
transfers.

All allowed address ranges are listed in dma-ranges DT parameter.

Example:

dma-ranges = < \
  0x4300 0x00 0x8000 0x00 0x8000 0x00 0x8000 \
  0x4300 0x08 0x 0x08 0x 0x08 0x \
  0x4300 0x80 0x 0x80 0x 0x40 0x>

In the above example of dma-ranges, memory address from

0x0 - 0x8000,
0x1 - 0x8,
0x10 - 0x80 and
0x100 - 0x.

are not allowed to be used as inbound addresses.

Based-on-patch-by: Oza Pawandeep 
Signed-off-by: Srinath Mannam 
[lorenzo.pieral...@arm.com: updated commit log]
Signed-off-by: Lorenzo Pieralisi 
Reviewed-by: Oza Pawandeep 
Reviewed-by: Eric Auger 
---
 drivers/pci/controller/pcie-iproc.c | 44 -
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/controller/pcie-iproc.c 
b/drivers/pci/controller/pcie-iproc.c
index c20fd6b..94ba5c0 100644
--- a/drivers/pci/controller/pcie-iproc.c
+++ b/drivers/pci/controller/pcie-iproc.c
@@ -1146,11 +1146,43 @@ static int iproc_pcie_setup_ib(struct iproc_pcie *pcie,
return ret;
 }
 
+static int
+iproc_pcie_add_dma_range(struct device *dev, struct list_head *resources,
+struct of_pci_range *range)
+{
+   struct resource *res;
+   struct resource_entry *entry, *tmp;
+   struct list_head *head = resources;
+
+   res = devm_kzalloc(dev, sizeof(struct resource), GFP_KERNEL);
+   if (!res)
+   return -ENOMEM;
+
+   resource_list_for_each_entry(tmp, resources) {
+   if (tmp->res->start < range->cpu_addr)
+   head = &tmp->node;
+   }
+
+   res->start = range->cpu_addr;
+   res->end = res->start + range->size - 1;
+
+   entry = resource_list_create_entry(res, 0);
+   if (!entry)
+   return -ENOMEM;
+
+   entry->offset = res->start - range->cpu_addr;
+   resource_list_add(entry, head);
+
+   return 0;
+}
+
 static int iproc_pcie_map_dma_ranges(struct iproc_pcie *pcie)
 {
+   struct pci_host_bridge *host = pci_host_bridge_from_priv(pcie);
struct of_pci_range range;
struct of_pci_range_parser parser;
int ret;
+   LIST_HEAD(resources);
 
/* Get the dma-ranges from DT */
	ret = of_pci_dma_range_parser_init(&parser, pcie->dev->of_node);
@@ -1158,13 +1190,23 @@ static int iproc_pcie_map_dma_ranges(struct iproc_pcie *pcie)
return ret;
 
	for_each_of_pci_range(&parser, &range) {
+   ret = iproc_pcie_add_dma_range(pcie->dev,
+  &resources,
+  &range);
+   if (ret)
+   goto out;
/* Each range entry corresponds to an inbound mapping region */
		ret = iproc_pcie_setup_ib(pcie, &range, IPROC_PCIE_IB_MAP_MEM);
if (ret)
-   return ret;
+   goto out;
}
 
+   list_splice_init(&resources, &host->dma_ranges);
+
return 0;
+out:
+   pci_free_resource_list(&resources);
+   return ret;
 }
 
 static int iproce_pcie_get_msi(struct iproc_pcie *pcie,
-- 
2.7.4



[PATCH v6 0/3] PCIe Host request to reserve IOVA

2019-05-03 Thread Srinath Mannam via iommu
This patch set will reserve IOVA addresses for DMA memory holes.

The IPROC host controller allows only a few ranges of physical addresses
as inbound PCI addresses, which are listed through the dma-ranges DT
property. A dma_ranges list field is added to the PCI host bridge
structure to hold these allowed inbound address ranges in sorted order.

Process this list and reserve IOVA addresses that are not present in its
resource entries (i.e. DMA memory holes) to prevent allocating IOVA
addresses that cannot be used as inbound addresses.

This patch set is based on Linux-5.1-rc3.

Changes from v5:
  - Addressed Robin Murphy, Lorenzo review comments.
- Error handling in dma ranges list processing.
- Used commit messages given by Lorenzo to all patches.

Changes from v4:
  - Addressed Bjorn, Robin Murphy and Auger Eric review comments.
- Commit message modification.
- Change DMA_BIT_MASK to "~(dma_addr_t)0".

Changes from v3:
  - Addressed Robin Murphy review comments.
- pcie-iproc: parse dma-ranges and make sorted resource list.
- dma-iommu: process list and reserve gaps between entries

Changes from v2:
  - Patch set rebased to Linux-5.0-rc2

Changes from v1:
  - Addressed Oza review comments.

Srinath Mannam (3):
  PCI: Add dma_ranges window list
  iommu/dma: Reserve IOVA for PCIe inaccessible DMA address
  PCI: iproc: Add sorted dma ranges resource entries to host bridge

 drivers/iommu/dma-iommu.c   | 35 ++---
 drivers/pci/controller/pcie-iproc.c | 44 -
 drivers/pci/probe.c |  3 +++
 include/linux/pci.h |  1 +
 4 files changed, 79 insertions(+), 4 deletions(-)

-- 
2.7.4



[PATCH v6 1/3] PCI: Add dma_ranges window list

2019-05-03 Thread Srinath Mannam via iommu
Add a dma_ranges field to the PCI host bridge structure to hold a list of
resource entries, in sorted order, representing memory ranges that can be
accessed through DMA transactions.

Based-on-patch-by: Oza Pawandeep 
Signed-off-by: Srinath Mannam 
[lorenzo.pieral...@arm.com: updated commit log]
Signed-off-by: Lorenzo Pieralisi 
Reviewed-by: Oza Pawandeep 
Acked-by: Bjorn Helgaas 
---
 drivers/pci/probe.c | 3 +++
 include/linux/pci.h | 1 +
 2 files changed, 4 insertions(+)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 7e12d01..72563c1 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -595,6 +595,7 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv)
return NULL;
 
 	INIT_LIST_HEAD(&bridge->windows);
+	INIT_LIST_HEAD(&bridge->dma_ranges);
bridge->dev.release = pci_release_host_bridge_dev;
 
/*
@@ -623,6 +624,7 @@ struct pci_host_bridge *devm_pci_alloc_host_bridge(struct device *dev,
 	return NULL;
 
 	INIT_LIST_HEAD(&bridge->windows);
+	INIT_LIST_HEAD(&bridge->dma_ranges);
bridge->dev.release = devm_pci_release_host_bridge_dev;
 
return bridge;
@@ -632,6 +634,7 @@ EXPORT_SYMBOL(devm_pci_alloc_host_bridge);
 void pci_free_host_bridge(struct pci_host_bridge *bridge)
 {
 	pci_free_resource_list(&bridge->windows);
+	pci_free_resource_list(&bridge->dma_ranges);
 
kfree(bridge);
 }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7744821..bba0a29 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -490,6 +490,7 @@ struct pci_host_bridge {
void*sysdata;
int busnr;
struct list_head windows;   /* resource_entry */
+   struct list_head dma_ranges;/* dma ranges resource list */
u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */
int (*map_irq)(const struct pci_dev *, u8, u8);
void (*release_fn)(struct pci_host_bridge *);
-- 
2.7.4



Re: [PATCH v3 7/7] iommu/dma-iommu: Remove iommu_dma_map_msi_msg()

2019-05-03 Thread Joerg Roedel
On Wed, May 01, 2019 at 02:58:24PM +0100, Julien Grall wrote:
> A recent change split iommu_dma_map_msi_msg() in two new functions. The
> function was still implemented to avoid modifying all the callers at
> once.
> 
> Now that all the callers have been reworked, iommu_dma_map_msi_msg() can
> be removed.
> 
> Signed-off-by: Julien Grall 
> Reviewed-by: Robin Murphy 
> Reviewed-by: Eric Auger 

Acked-by: Joerg Roedel 


Re: [PATCH v3 2/7] iommu/dma-iommu: Split iommu_dma_map_msi_msg() in two parts

2019-05-03 Thread Joerg Roedel
On Wed, May 01, 2019 at 02:58:19PM +0100, Julien Grall wrote:
> On RT, iommu_dma_map_msi_msg() may be called from non-preemptible
> context. This will lead to a splat with CONFIG_DEBUG_ATOMIC_SLEEP as
> the function is using spin_lock (they can sleep on RT).
> 
> iommu_dma_map_msi_msg() is used to map the MSI page in the IOMMU PT
> and update the MSI message with the IOVA.
> 
> Only the part to lookup for the MSI page requires to be called in
> preemptible context. As the MSI page cannot change over the lifecycle
> of the MSI interrupt, the lookup can be cached and re-used later on.
> 
> iomma_dma_map_msi_msg() is now split in two functions:
> - iommu_dma_prepare_msi(): This function will prepare the mapping
> in the IOMMU and store the cookie in the structure msi_desc. This
> function should be called in preemptible context.
> - iommu_dma_compose_msi_msg(): This function will update the MSI
> message with the IOVA when the device is behind an IOMMU.
> 
> Signed-off-by: Julien Grall 
> Reviewed-by: Robin Murphy 
> Reviewed-by: Eric Auguer 

Acked-by: Joerg Roedel 


Re: implement generic dma_map_ops for IOMMUs v4

2019-05-03 Thread Christoph Hellwig
On Fri, May 03, 2019 at 12:47:31PM +0100, Catalin Marinas wrote:
> On Thu, May 02, 2019 at 03:22:08PM +0200, Christoph Hellwig wrote:
> > can you quickly look over the arm64 parts?  I'd really like to still
> > get this series in for this merge window as it would conflict with
> > a lot of dma-mapping work for next merge window, and we also have
> > the amd and possibly intel iommu conversions to use it waiting.
> 
> Done. They look fine to me.

Ok, great.  Although I have to admit I feel about uneasy about
merging them for 5.2 unless Linus ends up doing an rc8.  I plan to
pull the prep_coherent patch into the dma-mapping tree as we'll
need it as a prepation for other things as well, and then we can
just have an immutable tree with the dma-iommu changes, so that
it doesn't block other DMA mapping changes that will touch this
code as well.

> 
> -- 
> Catalin
---end quoted text---


Re: implement generic dma_map_ops for IOMMUs v4

2019-05-03 Thread Catalin Marinas
On Thu, May 02, 2019 at 03:22:08PM +0200, Christoph Hellwig wrote:
> can you quickly look over the arm64 parts?  I'd really like to still
> get this series in for this merge window as it would conflict with
> a lot of dma-mapping work for next merge window, and we also have
> the amd and possibly intel iommu conversions to use it waiting.

Done. They look fine to me.

-- 
Catalin


Re: [PATCH 24/25] arm64: switch copyright boilerplace to SPDX in dma-mapping.c

2019-05-03 Thread Catalin Marinas
On Tue, Apr 30, 2019 at 06:52:13AM -0400, Christoph Hellwig wrote:
> Signed-off-by: Christoph Hellwig 
> Acked-by: Robin Murphy 
> Reviewed-by: Mukesh Ojha 

Acked-by: Catalin Marinas 


Re: [PATCH 25/25] arm64: trim includes in dma-mapping.c

2019-05-03 Thread Catalin Marinas
On Tue, Apr 30, 2019 at 06:52:14AM -0400, Christoph Hellwig wrote:
> With most of the previous functionality now elsewhere a lot of the
> headers included in this file are not needed.
> 
> Signed-off-by: Christoph Hellwig 

Acked-by: Catalin Marinas 


Re: [PATCH 04/25] iommu/dma: Remove the flush_page callback

2019-05-03 Thread Catalin Marinas
On Fri, May 03, 2019 at 12:43:35PM +0100, Catalin Marinas wrote:
> On Tue, Apr 30, 2019 at 06:51:53AM -0400, Christoph Hellwig wrote:
> > We now have a arch_dma_prep_coherent architecture hook that is used
> > for the generic DMA remap allocator, and we should use the same
> > interface for the dma-iommu code.
> > 
> > Signed-off-by: Christoph Hellwig 
> > Reviewed-by: Robin Murphy 
> > ---
> >  arch/arm64/mm/dma-mapping.c | 8 +---
> >  drivers/iommu/dma-iommu.c   | 8 +++-
> >  include/linux/dma-iommu.h   | 3 +--
> >  3 files changed, 5 insertions(+), 14 deletions(-)
> > 
> > diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> > index 674860e3e478..10a8852c8b6a 100644
> > --- a/arch/arm64/mm/dma-mapping.c
> > +++ b/arch/arm64/mm/dma-mapping.c
> > @@ -104,12 +104,6 @@ arch_initcall(arm64_dma_init);
> >  #include 
> >  #include 
> >  
> > -/* Thankfully, all cache ops are by VA so we can ignore phys here */
> > -static void flush_page(struct device *dev, const void *virt, phys_addr_t 
> > phys)
> > -{
> > -   __dma_flush_area(virt, PAGE_SIZE);
> > -}
> 
> Rather than removing, should this not become arch_dma_prep_coherent()?
> With patch 2 selecting the corresponding Kconfig option, I think with
> this patch you'd get a build error (haven't tried).

Ah, sorry, it was already there.

Acked-by: Catalin Marinas 


Re: [PATCH 06/25] iommu/dma: move the arm64 wrappers to common code

2019-05-03 Thread Catalin Marinas
On Tue, Apr 30, 2019 at 06:51:55AM -0400, Christoph Hellwig wrote:
> There is nothing really arm64 specific in the iommu_dma_ops
> implementation, so move it to dma-iommu.c and keep a lot of symbols
> self-contained.  Note the implementation does depend on the
> DMA_DIRECT_REMAP infrastructure for now, so we'll have to make the
> DMA_IOMMU support depend on it, but this will be relaxed soon.
> 
> Signed-off-by: Christoph Hellwig 
> Acked-by: Robin Murphy 

Acked-by: Catalin Marinas 


Re: [PATCH 04/25] iommu/dma: Remove the flush_page callback

2019-05-03 Thread Catalin Marinas
On Tue, Apr 30, 2019 at 06:51:53AM -0400, Christoph Hellwig wrote:
> We now have a arch_dma_prep_coherent architecture hook that is used
> for the generic DMA remap allocator, and we should use the same
> interface for the dma-iommu code.
> 
> Signed-off-by: Christoph Hellwig 
> Reviewed-by: Robin Murphy 
> ---
>  arch/arm64/mm/dma-mapping.c | 8 +---
>  drivers/iommu/dma-iommu.c   | 8 +++-
>  include/linux/dma-iommu.h   | 3 +--
>  3 files changed, 5 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> index 674860e3e478..10a8852c8b6a 100644
> --- a/arch/arm64/mm/dma-mapping.c
> +++ b/arch/arm64/mm/dma-mapping.c
> @@ -104,12 +104,6 @@ arch_initcall(arm64_dma_init);
>  #include 
>  #include 
>  
> -/* Thankfully, all cache ops are by VA so we can ignore phys here */
> -static void flush_page(struct device *dev, const void *virt, phys_addr_t 
> phys)
> -{
> - __dma_flush_area(virt, PAGE_SIZE);
> -}

Rather than removing, should this not become arch_dma_prep_coherent()?
With patch 2 selecting the corresponding Kconfig option, I think with
this patch you'd get a build error (haven't tried).

-- 
Catalin


Re: [PATCH 02/25] dma-mapping: add a Kconfig symbol to indicated arch_dma_prep_coherent presence

2019-05-03 Thread Catalin Marinas
On Tue, Apr 30, 2019 at 06:51:51AM -0400, Christoph Hellwig wrote:
> Add a Kconfig symbol that indicates an architecture provides a
> arch_dma_prep_coherent implementation, and provide a stub otherwise.
> 
> This will allow the generic dma-iommu code to use it while still
> allowing to be built for cache coherent architectures.
> 
> Signed-off-by: Christoph Hellwig 
> Reviewed-by: Robin Murphy 

Acked-by: Catalin Marinas 


Re: [PATCH 01/25] arm64/iommu: handle non-remapped addresses in ->mmap and ->get_sgtable

2019-05-03 Thread Catalin Marinas
On Tue, Apr 30, 2019 at 06:51:50AM -0400, Christoph Hellwig wrote:
> DMA allocations that can't sleep may return non-remapped addresses, but
> we do not properly handle them in the mmap and get_sgtable methods.
> Resolve non-vmalloc addresses using virt_to_page to handle this corner
> case.
> 
> Signed-off-by: Christoph Hellwig 
> Reviewed-by: Robin Murphy 

Acked-by: Catalin Marinas 


Re: [PATCH v5 2/3] iommu/dma: Reserve IOVA for PCIe inaccessible DMA address

2019-05-03 Thread Robin Murphy

On 03/05/2019 06:23, Srinath Mannam wrote:

Hi Robin, Lorenzo,

Thanks for review and guidance.
AFAIU, conclusion of discussion is, to return error if dma-ranges list
is not sorted.

So that, Can I send a new patch with below change to return error if
dma-ranges list is not sorted?

-static void iova_reserve_pci_windows(struct pci_dev *dev,
+static int iova_reserve_pci_windows(struct pci_dev *dev,
 struct iova_domain *iovad)
  {
 struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
@@ -227,11 +227,15 @@ static void iova_reserve_pci_windows(struct pci_dev *dev,
 resource_list_for_each_entry(window, &bridge->dma_ranges) {
 end = window->res->start - window->offset;
  resv_iova:
-   if (end - start) {
+   if (end > start) {
 lo = iova_pfn(iovad, start);
 hi = iova_pfn(iovad, end);
 reserve_iova(iovad, lo, hi);
+   } else {
+   dev_err(&dev->dev, "Unsorted dma_ranges list\n");
+   return -EINVAL;
 }
+

Please provide your inputs if any more changes required. Thank you,


You also need to handle and return this error where 
iova_reserve_pci_windows() is called from iova_reserve_iommu_regions().


Robin.


Regards,
Srinath.

On Thu, May 2, 2019 at 7:45 PM Robin Murphy  wrote:


On 02/05/2019 14:06, Lorenzo Pieralisi wrote:

On Thu, May 02, 2019 at 12:27:02PM +0100, Robin Murphy wrote:

Hi Lorenzo,

On 02/05/2019 12:01, Lorenzo Pieralisi wrote:

On Wed, May 01, 2019 at 11:06:25PM +0530, Srinath Mannam wrote:

dma_ranges field of PCI host bridge structure has resource entries in
sorted order of address range given through dma-ranges DT property. This
list is the accessible DMA address range. So that this resource list will
be processed and reserve IOVA address to the inaccessible address holes in
the list.

This method is similar to PCI IO resources address ranges reserving in
IOMMU for each EP connected to host bridge.

Signed-off-by: Srinath Mannam 
Based-on-patch-by: Oza Pawandeep 
Reviewed-by: Oza Pawandeep 
Acked-by: Robin Murphy 
---
drivers/iommu/dma-iommu.c | 19 +++
1 file changed, 19 insertions(+)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 77aabe6..da94844 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -212,6 +212,7 @@ static void iova_reserve_pci_windows(struct pci_dev *dev,
struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
struct resource_entry *window;
unsigned long lo, hi;
+  phys_addr_t start = 0, end;
resource_list_for_each_entry(window, >windows) {
if (resource_type(window->res) != IORESOURCE_MEM)
@@ -221,6 +222,24 @@ static void iova_reserve_pci_windows(struct pci_dev *dev,
hi = iova_pfn(iovad, window->res->end - window->offset);
reserve_iova(iovad, lo, hi);
}
+
+  /* Get reserved DMA windows from host bridge */
+  resource_list_for_each_entry(window, >dma_ranges) {


If this list is not sorted it seems to me the logic in this loop is
broken and you can't rely on callers to sort it because it is not a
written requirement and it is not enforced (you know because you
wrote the code but any other developer is not supposed to guess
it).

Can't we rewrite this loop so that it does not rely on list
entries order ?


The original idea was that callers should be required to provide a sorted
list, since it keeps things nice and simple...


I understand, if it was self-contained in driver code that would be fine
but in core code with possible multiple consumers this must be
documented/enforced, somehow.


I won't merge this series unless you sort it, no pun intended.

Lorenzo


+  end = window->res->start - window->offset;


...so would you consider it sufficient to add

  if (end < start)
  dev_err(...);


We should also revert any IOVA reservation we did prior to this
error, right ?


I think it would be enough to propagate an error code back out through
iommu_dma_init_domain(), which should then end up aborting the whole
IOMMU setup - reserve_iova() isn't really designed to be undoable, but
since this is the kind of error that should only ever be hit during
driver or DT development, as long as we continue booting such that the
developer can clearly see what's gone wrong, I don't think we need
bother spending too much effort tidying up inside the unused domain.


Anyway, I think it is best to ensure it *is* sorted.


here, plus commenting the definition of pci_host_bridge::dma_ranges
that it must be sorted in ascending order?


I don't think that commenting dma_ranges would help much, I am more
keen on making it work by construction.


[ I guess it might even make sense to factor out the parsing and list
construction from patch #3 into an 

Re: [PATCH v5 2/3] iommu/dma: Reserve IOVA for PCIe inaccessible DMA address

2019-05-03 Thread Srinath Mannam via iommu
Hi Robin,


On Fri, May 3, 2019 at 3:58 PM Robin Murphy  wrote:
>
> On 03/05/2019 06:23, Srinath Mannam wrote:
> > Hi Robin, Lorenzo,
> >
> > Thanks for review and guidance.
> > AFAIU, conclusion of discussion is, to return error if dma-ranges list
> > is not sorted.
> >
> > So that, Can I send a new patch with below change to return error if
> > dma-ranges list is not sorted?
> >
> > -static void iova_reserve_pci_windows(struct pci_dev *dev,
> > +static int iova_reserve_pci_windows(struct pci_dev *dev,
> >  struct iova_domain *iovad)
> >   {
> >  struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
> > @@ -227,11 +227,15 @@ static void iova_reserve_pci_windows(struct pci_dev 
> > *dev,
> >  resource_list_for_each_entry(window, >dma_ranges) {
> >  end = window->res->start - window->offset;
> >   resv_iova:
> > -   if (end - start) {
> > +   if (end > start) {
> >  lo = iova_pfn(iovad, start);
> >  hi = iova_pfn(iovad, end);
> >  reserve_iova(iovad, lo, hi);
> > +   } else {
> > +   dev_err(>dev, "Unsorted dma_ranges list\n");
> > +   return -EINVAL;
> >  }
> > +
> >
> > Please provide your inputs if any more changes required. Thank you,
>
> You also need to handle and return this error where
> iova_reserve_pci_windows() is called from iova_reserve_iommu_regions().
Thank you. I am doing this.

Regards,
Srinath.
>
> Robin.
>
> > Regards,
> > Srinath.
> >
> > On Thu, May 2, 2019 at 7:45 PM Robin Murphy  wrote:
> >>
> >> On 02/05/2019 14:06, Lorenzo Pieralisi wrote:
> >>> On Thu, May 02, 2019 at 12:27:02PM +0100, Robin Murphy wrote:
>  Hi Lorenzo,
> 
>  On 02/05/2019 12:01, Lorenzo Pieralisi wrote:
> > On Wed, May 01, 2019 at 11:06:25PM +0530, Srinath Mannam wrote:
> >> dma_ranges field of PCI host bridge structure has resource entries in
> >> sorted order of address range given through dma-ranges DT property. 
> >> This
> >> list is the accessible DMA address range. So that this resource list 
> >> will
> >> be processed and reserve IOVA address to the inaccessible address 
> >> holes in
> >> the list.
> >>
> >> This method is similar to PCI IO resources address ranges reserving in
> >> IOMMU for each EP connected to host bridge.
> >>
> >> Signed-off-by: Srinath Mannam 
> >> Based-on-patch-by: Oza Pawandeep 
> >> Reviewed-by: Oza Pawandeep 
> >> Acked-by: Robin Murphy 
> >> ---
> >> drivers/iommu/dma-iommu.c | 19 +++
> >> 1 file changed, 19 insertions(+)
> >>
> >> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> >> index 77aabe6..da94844 100644
> >> --- a/drivers/iommu/dma-iommu.c
> >> +++ b/drivers/iommu/dma-iommu.c
> >> @@ -212,6 +212,7 @@ static void iova_reserve_pci_windows(struct 
> >> pci_dev *dev,
> >> struct pci_host_bridge *bridge = 
> >> pci_find_host_bridge(dev->bus);
> >> struct resource_entry *window;
> >> unsigned long lo, hi;
> >> +  phys_addr_t start = 0, end;
> >> resource_list_for_each_entry(window, >windows) {
> >> if (resource_type(window->res) != IORESOURCE_MEM)
> >> @@ -221,6 +222,24 @@ static void iova_reserve_pci_windows(struct 
> >> pci_dev *dev,
> >> hi = iova_pfn(iovad, window->res->end - 
> >> window->offset);
> >> reserve_iova(iovad, lo, hi);
> >> }
> >> +
> >> +  /* Get reserved DMA windows from host bridge */
> >> +  resource_list_for_each_entry(window, >dma_ranges) {
> >
> > If this list is not sorted it seems to me the logic in this loop is
> > broken and you can't rely on callers to sort it because it is not a
> > written requirement and it is not enforced (you know because you
> > wrote the code but any other developer is not supposed to guess
> > it).
> >
> > Can't we rewrite this loop so that it does not rely on list
> > entries order ?
> 
>  The original idea was that callers should be required to provide a sorted
>  list, since it keeps things nice and simple...
> >>>
> >>> I understand, if it was self-contained in driver code that would be fine
> >>> but in core code with possible multiple consumers this must be
> >>> documented/enforced, somehow.
> >>>
> > I won't merge this series unless you sort it, no pun intended.
> >
> > Lorenzo
> >
> >> +  end = window->res->start - window->offset;
> 
>  ...so would you consider it sufficient to add
> 
>    if (end < start)
>    dev_err(...);
> >>>
> >>> We should also revert any IOVA reservation we did prior to this
> >>> error, right ?
> >>

Re: [PATCH v5 2/3] iommu/dma: Reserve IOVA for PCIe inaccessible DMA address

2019-05-03 Thread Srinath Mannam via iommu
Hi Lorenzo,

Thank you so much, Please see my reply below.

On Fri, May 3, 2019 at 3:23 PM Lorenzo Pieralisi
 wrote:
>
> On Fri, May 03, 2019 at 10:53:23AM +0530, Srinath Mannam wrote:
> > Hi Robin, Lorenzo,
> >
> > Thanks for review and guidance.
> > AFAIU, conclusion of discussion is, to return error if dma-ranges list
> > is not sorted.
> >
> > So that, Can I send a new patch with below change to return error if
> > dma-ranges list is not sorted?
>
> You can but I can't guarantee it will make it for v5.2.
>
> We will have to move the DT parsing and dma list ranges creation
> to core code anyway because I want this to work by construction,
> so even if we manage to make v5.2 you will have to do that.
Yes, Later I will work on it and do required core code changes.
>
> I pushed a branch out:
>
> not-to-merge/iova-dma-ranges
>
> where I rewrote all commit logs and I am not willing to do it again
> so please use them for your v6 posting if you manage to make it
> today.
Thank you, I will take all commit log changes and push v6 version today.

Regards,
Srinath.
>
> Lorenzo
>
> > -static void iova_reserve_pci_windows(struct pci_dev *dev,
> > +static int iova_reserve_pci_windows(struct pci_dev *dev,
> > struct iova_domain *iovad)
> >  {
> > struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
> > @@ -227,11 +227,15 @@ static void iova_reserve_pci_windows(struct pci_dev 
> > *dev,
> > resource_list_for_each_entry(window, >dma_ranges) {
> > end = window->res->start - window->offset;
> >  resv_iova:
> > -   if (end - start) {
> > +   if (end > start) {
> > lo = iova_pfn(iovad, start);
> > hi = iova_pfn(iovad, end);
> > reserve_iova(iovad, lo, hi);
> > +   } else {
> > +   dev_err(>dev, "Unsorted dma_ranges list\n");
> > +   return -EINVAL;
> > }
> > +
> >
> > Please provide your inputs if any more changes required. Thank you,
> >
> > Regards,
> > Srinath.
> >
> > On Thu, May 2, 2019 at 7:45 PM Robin Murphy  wrote:
> > >
> > > On 02/05/2019 14:06, Lorenzo Pieralisi wrote:
> > > > On Thu, May 02, 2019 at 12:27:02PM +0100, Robin Murphy wrote:
> > > >> Hi Lorenzo,
> > > >>
> > > >> On 02/05/2019 12:01, Lorenzo Pieralisi wrote:
> > > >>> On Wed, May 01, 2019 at 11:06:25PM +0530, Srinath Mannam wrote:
> > >  dma_ranges field of PCI host bridge structure has resource entries in
> > >  sorted order of address range given through dma-ranges DT property. 
> > >  This
> > >  list is the accessible DMA address range. So that this resource list 
> > >  will
> > >  be processed and reserve IOVA address to the inaccessible address 
> > >  holes in
> > >  the list.
> > > 
> > >  This method is similar to PCI IO resources address ranges reserving 
> > >  in
> > >  IOMMU for each EP connected to host bridge.
> > > 
> > >  Signed-off-by: Srinath Mannam 
> > >  Based-on-patch-by: Oza Pawandeep 
> > >  Reviewed-by: Oza Pawandeep 
> > >  Acked-by: Robin Murphy 
> > >  ---
> > > drivers/iommu/dma-iommu.c | 19 +++
> > > 1 file changed, 19 insertions(+)
> > > 
> > >  diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> > >  index 77aabe6..da94844 100644
> > >  --- a/drivers/iommu/dma-iommu.c
> > >  +++ b/drivers/iommu/dma-iommu.c
> > >  @@ -212,6 +212,7 @@ static void iova_reserve_pci_windows(struct 
> > >  pci_dev *dev,
> > > struct pci_host_bridge *bridge = 
> > >  pci_find_host_bridge(dev->bus);
> > > struct resource_entry *window;
> > > unsigned long lo, hi;
> > >  +  phys_addr_t start = 0, end;
> > > resource_list_for_each_entry(window, >windows) {
> > > if (resource_type(window->res) != IORESOURCE_MEM)
> > >  @@ -221,6 +222,24 @@ static void iova_reserve_pci_windows(struct 
> > >  pci_dev *dev,
> > > hi = iova_pfn(iovad, window->res->end - 
> > >  window->offset);
> > > reserve_iova(iovad, lo, hi);
> > > }
> > >  +
> > >  +  /* Get reserved DMA windows from host bridge */
> > >  +  resource_list_for_each_entry(window, >dma_ranges) {
> > > >>>
> > > >>> If this list is not sorted it seems to me the logic in this loop is
> > > >>> broken and you can't rely on callers to sort it because it is not a
> > > >>> written requirement and it is not enforced (you know because you
> > > >>> wrote the code but any other developer is not supposed to guess
> > > >>> it).
> > > >>>
> > > >>> Can't we rewrite this loop so that it does not rely on list
> > > >>> entries order ?
> > > >>
> > > >> The original idea was that callers should be required to provide a 
> > > >> sorted
> > > >> list, since it 

Re: [PATCH v5 2/3] iommu/dma: Reserve IOVA for PCIe inaccessible DMA address

2019-05-03 Thread Lorenzo Pieralisi
On Fri, May 03, 2019 at 10:53:23AM +0530, Srinath Mannam wrote:
> Hi Robin, Lorenzo,
> 
> Thanks for review and guidance.
> AFAIU, conclusion of discussion is, to return error if dma-ranges list
> is not sorted.
> 
> So that, Can I send a new patch with below change to return error if
> dma-ranges list is not sorted?

You can but I can't guarantee it will make it for v5.2.

We will have to move the DT parsing and dma list ranges creation
to core code anyway because I want this to work by construction,
so even if we manage to make v5.2 you will have to do that.

I pushed a branch out:

not-to-merge/iova-dma-ranges

where I rewrote all commit logs and I am not willing to do it again
so please use them for your v6 posting if you manage to make it
today.

Lorenzo

> -static void iova_reserve_pci_windows(struct pci_dev *dev,
> +static int iova_reserve_pci_windows(struct pci_dev *dev,
> struct iova_domain *iovad)
>  {
> struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
> @@ -227,11 +227,15 @@ static void iova_reserve_pci_windows(struct pci_dev 
> *dev,
> resource_list_for_each_entry(window, >dma_ranges) {
> end = window->res->start - window->offset;
>  resv_iova:
> -   if (end - start) {
> +   if (end > start) {
> lo = iova_pfn(iovad, start);
> hi = iova_pfn(iovad, end);
> reserve_iova(iovad, lo, hi);
> +   } else {
> +   dev_err(>dev, "Unsorted dma_ranges list\n");
> +   return -EINVAL;
> }
> +
> 
> Please provide your inputs if any more changes required. Thank you,
> 
> Regards,
> Srinath.
> 
> On Thu, May 2, 2019 at 7:45 PM Robin Murphy  wrote:
> >
> > On 02/05/2019 14:06, Lorenzo Pieralisi wrote:
> > > On Thu, May 02, 2019 at 12:27:02PM +0100, Robin Murphy wrote:
> > >> Hi Lorenzo,
> > >>
> > >> On 02/05/2019 12:01, Lorenzo Pieralisi wrote:
> > >>> On Wed, May 01, 2019 at 11:06:25PM +0530, Srinath Mannam wrote:
> >  dma_ranges field of PCI host bridge structure has resource entries in
> >  sorted order of address range given through dma-ranges DT property. 
> >  This
> >  list is the accessible DMA address range. So that this resource list 
> >  will
> >  be processed and reserve IOVA address to the inaccessible address 
> >  holes in
> >  the list.
> > 
> >  This method is similar to PCI IO resources address ranges reserving in
> >  IOMMU for each EP connected to host bridge.
> > 
> >  Signed-off-by: Srinath Mannam 
> >  Based-on-patch-by: Oza Pawandeep 
> >  Reviewed-by: Oza Pawandeep 
> >  Acked-by: Robin Murphy 
> >  ---
> > drivers/iommu/dma-iommu.c | 19 +++
> > 1 file changed, 19 insertions(+)
> > 
> >  diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> >  index 77aabe6..da94844 100644
> >  --- a/drivers/iommu/dma-iommu.c
> >  +++ b/drivers/iommu/dma-iommu.c
> >  @@ -212,6 +212,7 @@ static void iova_reserve_pci_windows(struct 
> >  pci_dev *dev,
> > struct pci_host_bridge *bridge = 
> >  pci_find_host_bridge(dev->bus);
> > struct resource_entry *window;
> > unsigned long lo, hi;
> >  +  phys_addr_t start = 0, end;
> > resource_list_for_each_entry(window, >windows) {
> > if (resource_type(window->res) != IORESOURCE_MEM)
> >  @@ -221,6 +222,24 @@ static void iova_reserve_pci_windows(struct 
> >  pci_dev *dev,
> > hi = iova_pfn(iovad, window->res->end - 
> >  window->offset);
> > reserve_iova(iovad, lo, hi);
> > }
> >  +
> >  +  /* Get reserved DMA windows from host bridge */
> >  +  resource_list_for_each_entry(window, >dma_ranges) {
> > >>>
> > >>> If this list is not sorted it seems to me the logic in this loop is
> > >>> broken and you can't rely on callers to sort it because it is not a
> > >>> written requirement and it is not enforced (you know because you
> > >>> wrote the code but any other developer is not supposed to guess
> > >>> it).
> > >>>
> > >>> Can't we rewrite this loop so that it does not rely on list
> > >>> entries order ?
> > >>
> > >> The original idea was that callers should be required to provide a sorted
> > >> list, since it keeps things nice and simple...
> > >
> > > I understand, if it was self-contained in driver code that would be fine
> > > but in core code with possible multiple consumers this must be
> > > documented/enforced, somehow.
> > >
> > >>> I won't merge this series unless you sort it, no pun intended.
> > >>>
> > >>> Lorenzo
> > >>>
> >  +  end = window->res->start - window->offset;
> > >>
> > >> ...so would you consider it sufficient to add
> > >>
> > >>  if (end < start)
> > >>