[PATCH v2 6/7] iommu/vt-d: Add page request draining support

2020-04-14 Thread Lu Baolu
When a PASID is stopped or terminated, there can be pending
PRQs (requests that haven't received responses) in remapping
hardware. This adds an interface to drain page requests and calls it
when a PASID is terminated.
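
All the teardown paths converted below use the same ordering; a
condensed sketch of that call order (names as used in the diff, not a
new API):

	/* Stop the PASID so no new page requests can be queued, */
	intel_pasid_tear_down_entry(iommu, dev, pasid);
	/* then drain the page requests already in flight, */
	intel_svm_drain_prq(dev, pasid);
	/* then flush the remaining per-device TLB state. */
	intel_flush_svm_range_dev(svm, sdev, 0, -1, 0);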

Signed-off-by: Jacob Pan 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-svm.c   | 90 ++---
 include/linux/intel-iommu.h |  1 +
 2 files changed, 86 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 05aeb8ea51c4..736dd39fb52b 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -23,6 +23,7 @@
 #include "intel-pasid.h"
 
 static irqreturn_t prq_event_thread(int irq, void *d);
+static void intel_svm_drain_prq(struct device *dev, int pasid);
 
 #define PRQ_ORDER 0
 
@@ -210,6 +211,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
rcu_read_lock();
list_for_each_entry_rcu(sdev, &svm->devs, list) {
intel_pasid_tear_down_entry(svm->iommu, sdev->dev, svm->pasid);
+   intel_svm_drain_prq(sdev->dev, svm->pasid);
intel_flush_svm_range_dev(svm, sdev, 0, -1, 0);
}
rcu_read_unlock();
@@ -403,12 +405,8 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid)
if (!sdev->users) {
list_del_rcu(&sdev->list);
intel_pasid_tear_down_entry(iommu, dev, svm->pasid);
+   intel_svm_drain_prq(dev, svm->pasid);
intel_flush_svm_range_dev(svm, sdev, 0, -1, 0);
-   /* TODO: Drain in flight PRQ for the PASID since it
-* may get reused soon, we don't want to
-* confuse with its previous life.
-* intel_svm_drain_prq(dev, pasid);
-*/
kfree_rcu(sdev, rcu);
 
if (list_empty(&svm->devs)) {
@@ -646,6 +644,7 @@ int intel_svm_unbind_mm(struct device *dev, int pasid)
 * large and has to be physically contiguous. So it's
 * hard to be as defensive as we might like. */
intel_pasid_tear_down_entry(iommu, dev, svm->pasid);
+   intel_svm_drain_prq(dev, svm->pasid);
intel_flush_svm_range_dev(svm, sdev, 0, -1, 0);
kfree_rcu(sdev, rcu);
 
@@ -703,6 +702,7 @@ struct page_req_dsc {
 struct page_req {
struct list_head list;
struct page_req_dsc desc;
+   struct completion complete;
unsigned int processing:1;
unsigned int drained:1;
unsigned int completed:1;
@@ -732,9 +732,83 @@ static bool is_canonical_address(u64 addr)
return (((saddr << shift) >> shift) == saddr);
 }
 
+/**
+ * intel_svm_drain_prq:
+ *
+ * Drain all pending page requests related to a specific pasid in both
+ * software and hardware. The caller must guarantee that no more page
+ * requests related to this pasid will come in.
+ */
+static void intel_svm_drain_prq(struct device *dev, int pasid)
+{
+   struct device_domain_info *info;
+   struct dmar_domain *domain;
+   struct intel_iommu *iommu;
+   struct qi_desc desc[3];
+   struct pci_dev *pdev;
+   struct page_req *req;
+   unsigned long flags;
+   u16 sid, did;
+   int qdep;
+
+   info = get_domain_info(dev);
+   if (WARN_ON(!info || !dev_is_pci(dev)))
+   return;
+
+   iommu = info->iommu;
+   domain = info->domain;
+   pdev = to_pci_dev(dev);
+
+   /* Mark all related pending requests drained. */
+   spin_lock_irqsave(&iommu->prq_lock, flags);
+   list_for_each_entry(req, &iommu->prq_list, list)
+   if (req->desc.pasid_present && req->desc.pasid == pasid)
+   req->drained = true;
+   spin_unlock_irqrestore(&iommu->prq_lock, flags);
+
+   /* Wait until all related pending requests complete. */
+retry:
+   spin_lock_irqsave(&iommu->prq_lock, flags);
+   list_for_each_entry(req, &iommu->prq_list, list) {
+   if (req->desc.pasid_present &&
+   req->desc.pasid == pasid &&
+   !req->completed) {
+   spin_unlock_irqrestore(&iommu->prq_lock, flags);
+   wait_for_completion_timeout(&req->complete, 5 * HZ);
+   goto retry;
+   }
+   }
+   spin_unlock_irqrestore(&iommu->prq_lock, flags);
+
+   /*
+* Perform steps described in VT-d spec CH7.10 to drain page
+* request and responses in hardware.
+*/
+   sid = PCI_DEVID(info->bus, info->devfn);
+   did = domain->iommu_did[iommu->seq_id];
+   qdep = pci_ats_queue_depth(pdev);
+
+   memset(desc, 0, sizeof(desc));
+   desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
+   QI_IWD_FENCE |
+   QI_IWD_TYPE;
+   desc[1].qw0 = 

[PATCH v2 3/7] iommu/vt-d: debugfs: Add support to show inv queue internals

2020-04-14 Thread Lu Baolu
Export invalidation queue internals of each iommu device through
the debugfs.

Example of such dump on a Skylake machine:

$ sudo cat /sys/kernel/debug/iommu/intel/invalidation_queue
Invalidation queue on IOMMU: dmar1
 Base: 0x1672c9000  Head: 80  Tail: 80
Index   qw0 qw1 status
0   0004
1   000200250001672be804
2   0011
3   000200250001672be80c
4   00d2
5   000200250001672be814
6   0014
7   000200250001672be81c
8   0014
9   000200250001672be824

Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-iommu-debugfs.c | 62 +
 1 file changed, 62 insertions(+)

diff --git a/drivers/iommu/intel-iommu-debugfs.c b/drivers/iommu/intel-iommu-debugfs.c
index 3eb1fe240fb0..e3089865b8f3 100644
--- a/drivers/iommu/intel-iommu-debugfs.c
+++ b/drivers/iommu/intel-iommu-debugfs.c
@@ -372,6 +372,66 @@ static int domain_translation_struct_show(struct seq_file *m, void *unused)
 }
 DEFINE_SHOW_ATTRIBUTE(domain_translation_struct);
 
+static void invalidation_queue_entry_show(struct seq_file *m,
+ struct intel_iommu *iommu)
+{
+   int index, shift = qi_shift(iommu);
+   struct qi_desc *desc;
+   int offset;
+
+   if (ecap_smts(iommu->ecap))
+   seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tqw2\t\t\tqw3\t\t\tstatus\n");
+   else
+   seq_puts(m, "Index\t\tqw0\t\t\tqw1\t\t\tstatus\n");
+
+   for (index = 0; index < QI_LENGTH; index++) {
+   offset = index << shift;
+   desc = iommu->qi->desc + offset;
+   if (ecap_smts(iommu->ecap))
+   seq_printf(m, "%5d\t%016llx\t%016llx\t%016llx\t%016llx\t%016x\n",
+  index, desc->qw0, desc->qw1,
+  desc->qw2, desc->qw3,
+  iommu->qi->desc_status[index]);
+   else
+   seq_printf(m, "%5d\t%016llx\t%016llx\t%016x\n",
+  index, desc->qw0, desc->qw1,
+  iommu->qi->desc_status[index]);
+   }
+}
+
+static int invalidation_queue_show(struct seq_file *m, void *unused)
+{
+   struct dmar_drhd_unit *drhd;
+   struct intel_iommu *iommu;
+   unsigned long flags;
+   struct q_inval *qi;
+   int shift;
+
+   rcu_read_lock();
+   for_each_active_iommu(iommu, drhd) {
+   qi = iommu->qi;
+   shift = qi_shift(iommu);
+
+   if (!qi || !ecap_qis(iommu->ecap))
+   continue;
+
+   seq_printf(m, "Invalidation queue on IOMMU: %s\n", iommu->name);
+
+   raw_spin_lock_irqsave(&qi->q_lock, flags);
+   seq_printf(m, " Base: 0x%llx\tHead: %lld\tTail: %lld\n",
+  virt_to_phys(qi->desc),
+  dmar_readq(iommu->reg + DMAR_IQH_REG) >> shift,
+  dmar_readq(iommu->reg + DMAR_IQT_REG) >> shift);
+   invalidation_queue_entry_show(m, iommu);
+   raw_spin_unlock_irqrestore(&qi->q_lock, flags);
+   seq_putc(m, '\n');
+   }
+   rcu_read_unlock();
+
+   return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(invalidation_queue);
+
 #ifdef CONFIG_IRQ_REMAP
 static void ir_tbl_remap_entry_show(struct seq_file *m,
struct intel_iommu *iommu)
@@ -490,6 +550,8 @@ void __init intel_iommu_debugfs_init(void)
debugfs_create_file("domain_translation_struct", 0444,
intel_iommu_debug, NULL,
&domain_translation_struct_fops);
+   debugfs_create_file("invalidation_queue", 0444, intel_iommu_debug,
+   NULL, &invalidation_queue_fops);
 #ifdef CONFIG_IRQ_REMAP
debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug,
NULL, &ir_translation_struct_fops);
-- 
2.17.1



[PATCH v2 5/7] iommu/vt-d: Save prq descriptors in an internal list

2020-04-14 Thread Lu Baolu
Currently, the page request interrupt thread handles the page
requests in the queue in this way:

- Clear the PPR bit to ensure a new interrupt can come in;
- Read and record the head and tail registers;
- Handle all descriptors between head and tail;
- Write the tail value to the head register.

This might cause some descriptors to be handled multiple times.
An example sequence:

- Thread A gets scheduled with PRQ_1 and PRQ_2 in the queue;
- Thread A clears the PPR bit and records the head and tail;
- A new PRQ_3 comes in and Thread B gets scheduled;
- Thread B records the head and tail, which include PRQ_1
  and PRQ_2.

As a result, PRQ_1 and PRQ_2 are handled twice, in Thread_A and
Thread_B.

       Thread_A                 Thread_B
      .--------.               .--------.
      |        |               |        |
      .--------.               .--------.
  head| PRQ_1  |           head| PRQ_1  |
      .--------.               .--------.
      | PRQ_2  |               | PRQ_2  |
      .--------.               .--------.
  tail|        |               | PRQ_3  |
      .--------.               .--------.
      |        |           tail|        |
      '--------'               '--------'

To avoid this, we could take a spinlock to ensure that PRQs are
handled in a serialized way, but that would mean calling
intel_svm_process_prq() with a spinlock held, which adds extra
complexity to intel_svm_process_prq().

This patch makes PRQ descriptors be handled in a serialized way,
while removing the requirement of holding the spinlock in
intel_svm_process_prq(), by saving the descriptors in a list.
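
Each descriptor saved in the list carries "processing" and "completed"
flags that keep the handling serialized across threads. A condensed
sketch of the per-entry logic inside the list walk added below (run
under iommu->prq_lock):

	if (!req->processing) {
		/* Unclaimed: claim it, drop the lock, process it. */
		req->processing = true;
	} else if (req->completed) {
		/* Already processed by whichever thread claimed it: retire it. */
		list_del(&req->list);
		kfree(req);
	} else {
		/* Another thread is still processing it: stop to keep ordering. */
		break;
	}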

Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-svm.c   | 58 ++---
 include/linux/intel-iommu.h |  2 ++
 2 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index a1921b462783..05aeb8ea51c4 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -50,6 +50,8 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
return ret;
}
iommu->pr_irq = irq;
+   INIT_LIST_HEAD(&iommu->prq_list);
+   spin_lock_init(&iommu->prq_lock);
 
snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
 
@@ -698,6 +700,14 @@ struct page_req_dsc {
 
 #define PRQ_RING_MASK  ((0x1000 << PRQ_ORDER) - 0x20)
 
+struct page_req {
+   struct list_head list;
+   struct page_req_dsc desc;
+   unsigned int processing:1;
+   unsigned int drained:1;
+   unsigned int completed:1;
+};
+
 static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
 {
unsigned long requested = 0;
@@ -842,34 +852,60 @@ static void process_single_prq(struct intel_iommu *iommu,
}
 }
 
-static void intel_svm_process_prq(struct intel_iommu *iommu,
- struct page_req_dsc *prq,
- int head, int tail)
+static void intel_svm_process_prq(struct intel_iommu *iommu)
 {
-   struct page_req_dsc *req;
-
-   while (head != tail) {
-   req = &iommu->prq[head / sizeof(*req)];
-   process_single_prq(iommu, req);
-   head = (head + sizeof(*req)) & PRQ_RING_MASK;
+   struct page_req *req;
+   unsigned long flags;
+
+   spin_lock_irqsave(&iommu->prq_lock, flags);
+   while (!list_empty(&iommu->prq_list)) {
+   req = list_first_entry(&iommu->prq_list, struct page_req, list);
+   if (!req->processing) {
+   req->processing = true;
+   spin_unlock_irqrestore(&iommu->prq_lock, flags);
+   process_single_prq(iommu, &req->desc);
+   spin_lock_irqsave(&iommu->prq_lock, flags);
+   req->completed = true;
+   } else if (req->completed) {
+   list_del(&req->list);
+   kfree(req);
+   } else {
+   break;
+   }
+   }
}
+   spin_unlock_irqrestore(&iommu->prq_lock, flags);
 }
 
 static irqreturn_t prq_event_thread(int irq, void *d)
 {
struct intel_iommu *iommu = d;
+   unsigned long flags;
int head, tail;
 
+   spin_lock_irqsave(&iommu->prq_lock, flags);
/*
 * Clear PPR bit before reading head/tail registers, to
 * ensure that we get a new interrupt if needed.
 */
writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
-
tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
-   intel_svm_process_prq(iommu, iommu->prq, head, tail);
+   while (head != tail) {
+   struct page_req_dsc *dsc;
+   struct page_req *req;
+
+   dsc = &iommu->prq[head / sizeof(*dsc)];
+   req = kzalloc(sizeof (*req), GFP_ATOMIC);
+   if (!req)
+   break;
+   req->desc = *dsc;
+   list_add_tail(&req->list, &iommu->prq_list);
+   head = (head + 

[PATCH v2 1/7] iommu/vt-d: Refactor parameters for qi_submit_sync()

2020-04-14 Thread Lu Baolu
The current qi_submit_sync() supports a single invalidation descriptor
per submission and appends a wait descriptor after each submission to
poll for hardware completion. This patch adjusts the parameters of the
function so that multiple descriptors per submission can
be supported.
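
A before/after sketch of a typical caller (the mechanical conversions
are in the diff below; existing callers pass a count of 1 and no
options):

	/* before */
	qi_submit_sync(&desc, iommu);
	/* after */
	qi_submit_sync(iommu, &desc, 1, 0);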

Signed-off-by: Jacob Pan 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/dmar.c| 24 ++--
 drivers/iommu/intel-pasid.c |  4 ++--
 drivers/iommu/intel-svm.c   |  6 +++---
 drivers/iommu/intel_irq_remapping.c |  2 +-
 include/linux/intel-iommu.h |  8 +++-
 5 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index d9dc787feef7..bb42177e2369 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1225,10 +1225,14 @@ static int qi_check_fault(struct intel_iommu *iommu, int index)
 }
 
 /*
- * Submit the queued invalidation descriptor to the remapping
- * hardware unit and wait for its completion.
+ * Function to submit invalidation descriptors of all types to the queued
+ * invalidation interface (QI). Multiple descriptors can be submitted at a
+ * time; a wait descriptor will be appended to each submission to ensure
+ * hardware has completed the invalidation before returning. Wait descriptors
+ * can be part of the submission, but they will not be polled for completion.
  */
-int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
+int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
+  unsigned int count, unsigned long options)
 {
int rc;
struct q_inval *qi = iommu->qi;
@@ -1318,7 +1322,7 @@ void qi_global_iec(struct intel_iommu *iommu)
desc.qw3 = 0;
 
/* should never fail */
-   qi_submit_sync(&desc, iommu);
+   qi_submit_sync(iommu, &desc, 1, 0);
 }
 
 void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
@@ -1332,7 +1336,7 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
desc.qw2 = 0;
desc.qw3 = 0;
 
-   qi_submit_sync(&desc, iommu);
+   qi_submit_sync(iommu, &desc, 1, 0);
 }
 
 void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
@@ -1356,7 +1360,7 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
desc.qw2 = 0;
desc.qw3 = 0;
 
-   qi_submit_sync(&desc, iommu);
+   qi_submit_sync(iommu, &desc, 1, 0);
 }
 
 void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
@@ -1378,7 +1382,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
desc.qw2 = 0;
desc.qw3 = 0;
 
-   qi_submit_sync(&desc, iommu);
+   qi_submit_sync(iommu, &desc, 1, 0);
 }
 
 /* PASID-based IOTLB invalidation */
@@ -1419,7 +1423,7 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr,
QI_EIOTLB_AM(mask);
}
 
-   qi_submit_sync(&desc, iommu);
+   qi_submit_sync(iommu, &desc, 1, 0);
 }
 
 /* PASID-based device IOTLB Invalidate */
@@ -1448,7 +1452,7 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid,
if (size_order)
desc.qw1 |= QI_DEV_EIOTLB_SIZE;
 
-   qi_submit_sync(&desc, iommu);
+   qi_submit_sync(iommu, &desc, 1, 0);
 }
 
 void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did,
@@ -1458,7 +1462,7 @@ void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did,
 
desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) |
QI_PC_GRAN(granu) | QI_PC_TYPE;
-   qi_submit_sync(&desc, iommu);
+   qi_submit_sync(iommu, &desc, 1, 0);
 }
 
 /*
diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c
index 48cc9ca5f3dc..7969e3dac2ad 100644
--- a/drivers/iommu/intel-pasid.c
+++ b/drivers/iommu/intel-pasid.c
@@ -498,7 +498,7 @@ pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
desc.qw2 = 0;
desc.qw3 = 0;
 
-   qi_submit_sync(&desc, iommu);
+   qi_submit_sync(iommu, &desc, 1, 0);
 }
 
 static void
@@ -512,7 +512,7 @@ iotlb_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid)
desc.qw2 = 0;
desc.qw3 = 0;
 
-   qi_submit_sync(&desc, iommu);
+   qi_submit_sync(iommu, &desc, 1, 0);
 }
 
 static void
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index e9f4e979a71f..83dc4319f661 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -138,7 +138,7 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d
}
desc.qw2 = 0;
desc.qw3 = 0;
-   qi_submit_sync(&desc, svm->iommu);
+   qi_submit_sync(svm->iommu, &desc, 1, 0);
 
if (sdev->dev_iotlb) {
desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
@@ -162,7 +162,7 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d
}
desc.qw2 = 0;
desc.qw3 = 0;
-   qi_submit_sync(&desc, svm->iommu);
+   

[PATCH v2 2/7] iommu/vt-d: Multiple descriptors per qi_submit_sync()

2020-04-14 Thread Lu Baolu
Extend the qi_submit_sync() function to support multiple descriptors
per submission.
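
The free-slot check below reserves count + 2 entries per submission:
the count descriptors themselves, one appended wait descriptor, and
one entry kept free between head and tail. As a worked example, a
batch of two descriptors needs qi->free_cnt >= 4, places its wait
descriptor at (index + 2) % QI_LENGTH, and advances free_head by
count + 1 = 3 entries.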

Signed-off-by: Jacob Pan 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/dmar.c| 39 +++--
 include/linux/intel-iommu.h |  1 +
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index bb42177e2369..61d049e91f84 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1157,12 +1157,11 @@ static inline void reclaim_free_desc(struct q_inval *qi)
}
 }
 
-static int qi_check_fault(struct intel_iommu *iommu, int index)
+static int qi_check_fault(struct intel_iommu *iommu, int index, int wait_index)
 {
u32 fault;
int head, tail;
struct q_inval *qi = iommu->qi;
-   int wait_index = (index + 1) % QI_LENGTH;
int shift = qi_shift(iommu);
 
if (qi->desc_status[wait_index] == QI_ABORT)
@@ -1234,12 +1233,12 @@ static int qi_check_fault(struct intel_iommu *iommu, int index)
 int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
   unsigned int count, unsigned long options)
 {
-   int rc;
struct q_inval *qi = iommu->qi;
-   int offset, shift, length;
struct qi_desc wait_desc;
int wait_index, index;
unsigned long flags;
+   int offset, shift;
+   int rc, i;
 
if (!qi)
return 0;
@@ -1248,32 +1247,41 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
rc = 0;
 
raw_spin_lock_irqsave(&qi->q_lock, flags);
-   while (qi->free_cnt < 3) {
+   /*
+* Check if we have enough empty slots in the queue to submit,
+* the calculation is based on:
+* # of desc + 1 wait desc + 1 space between head and tail
+*/
+   while (qi->free_cnt < count + 2) {
raw_spin_unlock_irqrestore(&qi->q_lock, flags);
cpu_relax();
raw_spin_lock_irqsave(&qi->q_lock, flags);
}
 
index = qi->free_head;
-   wait_index = (index + 1) % QI_LENGTH;
+   wait_index = (index + count) % QI_LENGTH;
shift = qi_shift(iommu);
-   length = 1 << shift;
 
-   qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE;
+   for (i = 0; i < count; i++) {
+   offset = ((index + i) % QI_LENGTH) << shift;
+   memcpy(qi->desc + offset, &desc[i], 1 << shift);
+   qi->desc_status[(index + i) % QI_LENGTH] = QI_IN_USE;
+   }
+   qi->desc_status[wait_index] = QI_IN_USE;
 
-   offset = index << shift;
-   memcpy(qi->desc + offset, desc, length);
wait_desc.qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
QI_IWD_STATUS_WRITE | QI_IWD_TYPE;
+   if (options & QI_OPT_WAIT_DRAIN)
+   wait_desc.qw0 |= QI_IWD_PRQ_DRAIN;
wait_desc.qw1 = virt_to_phys(&qi->desc_status[wait_index]);
wait_desc.qw2 = 0;
wait_desc.qw3 = 0;
 
offset = wait_index << shift;
-   memcpy(qi->desc + offset, &wait_desc, length);
+   memcpy(qi->desc + offset, &wait_desc, 1 << shift);
 
-   qi->free_head = (qi->free_head + 2) % QI_LENGTH;
-   qi->free_cnt -= 2;
+   qi->free_head = (qi->free_head + count + 1) % QI_LENGTH;
+   qi->free_cnt -= count + 1;
 
/*
 * update the HW tail register indicating the presence of
@@ -1289,7 +1297,7 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
 * a deadlock where the interrupt context can wait indefinitely
 * for free slots in the queue.
 */
-   rc = qi_check_fault(iommu, index);
+   rc = qi_check_fault(iommu, index, wait_index);
if (rc)
break;
 
@@ -1298,7 +1306,8 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
raw_spin_lock(&qi->q_lock);
}
 
-   qi->desc_status[index] = QI_DONE;
+   for (i = 0; i < count; i++)
+   qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE;
 
reclaim_free_desc(qi);
raw_spin_unlock_irqrestore(&qi->q_lock, flags);
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index ee2d5cdd8339..cca1e5f9aeaa 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -333,6 +333,7 @@ enum {
 
 #define QI_IWD_STATUS_DATA(d)  (((u64)d) << 32)
 #define QI_IWD_STATUS_WRITE(((u64)1) << 5)
+#define QI_IWD_PRQ_DRAIN   (((u64)1) << 7)
 
 #define QI_IOTLB_DID(did)  (((u64)did) << 16)
 #define QI_IOTLB_DR(dr)(((u64)dr) << 7)
-- 
2.17.1



[PATCH v2 4/7] iommu/vt-d: Refactor prq_event_thread()

2020-04-14 Thread Lu Baolu
Move the part of prq_event_thread() that does the software processing
of page request descriptors into a separate function. No functional
changes.

Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-svm.c | 256 --
 1 file changed, 135 insertions(+), 121 deletions(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 83dc4319f661..a1921b462783 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -722,142 +722,156 @@ static bool is_canonical_address(u64 addr)
return (((saddr << shift) >> shift) == saddr);
 }
 
-static irqreturn_t prq_event_thread(int irq, void *d)
+static void process_single_prq(struct intel_iommu *iommu,
+  struct page_req_dsc *req)
 {
-   struct intel_iommu *iommu = d;
-   struct intel_svm *svm = NULL;
-   int head, tail, handled = 0;
-
-   /* Clear PPR bit before reading head/tail registers, to
-* ensure that we get a new interrupt if needed. */
-   writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
-
-   tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
-   head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
-   while (head != tail) {
-   struct intel_svm_dev *sdev;
-   struct vm_area_struct *vma;
-   struct page_req_dsc *req;
-   struct qi_desc resp;
-   int result;
-   vm_fault_t ret;
-   u64 address;
-
-   handled = 1;
-
-   req = &iommu->prq[head / sizeof(*req)];
+   int result = QI_RESP_FAILURE;
+   struct intel_svm_dev *sdev;
+   struct vm_area_struct *vma;
+   struct intel_svm *svm;
+   struct qi_desc resp;
+   vm_fault_t ret;
+   u64 address;
+
+   address = (u64)req->addr << VTD_PAGE_SHIFT;
+   if (!req->pasid_present) {
+   pr_err("%s: Page request without PASID: %08llx %08llx\n",
+  iommu->name, ((unsigned long long *)req)[0],
+  ((unsigned long long *)req)[1]);
+   goto no_pasid;
+   }
 
-   result = QI_RESP_FAILURE;
-   address = (u64)req->addr << VTD_PAGE_SHIFT;
-   if (!req->pasid_present) {
-   pr_err("%s: Page request without PASID: %08llx 
%08llx\n",
-  iommu->name, ((unsigned long long *)req)[0],
-  ((unsigned long long *)req)[1]);
-   goto no_pasid;
-   }
+   rcu_read_lock();
+   svm = ioasid_find(NULL, req->pasid, NULL);
+   /*
+* It *can't* go away, because the driver is not permitted
+* to unbind the mm while any page faults are outstanding.
+* So we only need RCU to protect the internal idr code.
+*/
+   rcu_read_unlock();
 
-   if (!svm || svm->pasid != req->pasid) {
-   rcu_read_lock();
-   svm = ioasid_find(NULL, req->pasid, NULL);
-   /* It *can't* go away, because the driver is not 
permitted
-* to unbind the mm while any page faults are 
outstanding.
-* So we only need RCU to protect the internal idr 
code. */
-   rcu_read_unlock();
-   if (IS_ERR_OR_NULL(svm)) {
-   pr_err("%s: Page request for invalid PASID %d: 
%08llx %08llx\n",
-  iommu->name, req->pasid, ((unsigned long 
long *)req)[0],
-  ((unsigned long long *)req)[1]);
-   goto no_pasid;
-   }
-   }
+   if (IS_ERR_OR_NULL(svm)) {
+   pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
+  iommu->name, req->pasid, ((unsigned long long *)req)[0],
+  ((unsigned long long *)req)[1]);
+   goto no_pasid;
+   }
 
-   result = QI_RESP_INVALID;
-   /* Since we're using init_mm.pgd directly, we should never take
-* any faults on kernel addresses. */
-   if (!svm->mm)
-   goto bad_req;
+   result = QI_RESP_INVALID;
+   /* Since we're using init_mm.pgd directly, we should never take
+* any faults on kernel addresses. */
+   if (!svm->mm)
+   goto bad_req;
+
+   /* If address is not canonical, return invalid response */
+   if (!is_canonical_address(address))
+   goto bad_req;
+
+   /* If the mm is already defunct, don't handle faults. */
+   if (!mmget_not_zero(svm->mm))
+   goto bad_req;
+
+   down_read(&svm->mm->mmap_sem);
+   vma = find_extend_vma(svm->mm, address);
+   if (!vma || address < vma->vm_start)
+   goto invalid;
+
+   if (access_error(vma, req))
+   goto invalid;
+
+   

[PATCH v2 7/7] iommu/vt-d: Remove redundant IOTLB flush

2020-04-14 Thread Lu Baolu
The IOTLB flush is already included in the PASID tear down and the
page request drain process. There is no need to flush again.

Signed-off-by: Jacob Pan 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-svm.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 736dd39fb52b..56e8d35225fc 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -212,7 +212,6 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
list_for_each_entry_rcu(sdev, &svm->devs, list) {
intel_pasid_tear_down_entry(svm->iommu, sdev->dev, svm->pasid);
intel_svm_drain_prq(sdev->dev, svm->pasid);
-   intel_flush_svm_range_dev(svm, sdev, 0, -1, 0);
}
rcu_read_unlock();
 
@@ -406,7 +405,6 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid)
list_del_rcu(>list);
intel_pasid_tear_down_entry(iommu, dev, svm->pasid);
intel_svm_drain_prq(dev, svm->pasid);
-   intel_flush_svm_range_dev(svm, sdev, 0, -1, 0);
kfree_rcu(sdev, rcu);
 
if (list_empty(&svm->devs)) {
@@ -645,7 +643,6 @@ int intel_svm_unbind_mm(struct device *dev, int pasid)
 * hard to be as defensive as we might like. */
intel_pasid_tear_down_entry(iommu, dev, svm->pasid);
intel_svm_drain_prq(dev, svm->pasid);
-   intel_flush_svm_range_dev(svm, sdev, 0, -1, 0);
kfree_rcu(sdev, rcu);
 
if (list_empty(&svm->devs)) {
-- 
2.17.1



[PATCH v2 0/7] iommu/vt-d: Add page request draining support

2020-04-14 Thread Lu Baolu
When a PASID is stopped or terminated, there can be pending PRQs
(requests that haven't received responses) in the software and in the
remapping hardware. The pending page requests must be drained so that
the PASID can be reused. The register-level interface for page request
draining is defined in section 7.11 of the VT-d spec. This series adds
support for page request draining.

This includes two parts:
 - PATCH 1/7 ~ 3/7: refactor the qi_submit_sync() to support
   multiple descriptors per submission which will be used by
   PATCH 6/7.
 - PATCH 4/7 ~ 7/7: add page request drain support after a
   pasid entry is torn down due to an unbind operation.

Please help to review.

Best regards,
baolu

Change log:
 v1->v2:
  - Fix race between multiple prq handling threads

Lu Baolu (7):
  iommu/vt-d: Refactor parameters for qi_submit_sync()
  iommu/vt-d: Multiple descriptors per qi_submit_sync()
  iommu/vt-d: debugfs: Add support to show inv queue internals
  iommu/vt-d: Refactor prq_event_thread()
  iommu/vt-d: Save prq descriptors in an internal list
  iommu/vt-d: Add page request draining support
  iommu/vt-d: Remove redundant IOTLB flush

 drivers/iommu/dmar.c|  63 +++--
 drivers/iommu/intel-iommu-debugfs.c |  62 +
 drivers/iommu/intel-pasid.c |   4 +-
 drivers/iommu/intel-svm.c   | 383 ++--
 drivers/iommu/intel_irq_remapping.c |   2 +-
 include/linux/intel-iommu.h |  12 +-
 6 files changed, 369 insertions(+), 157 deletions(-)

-- 
2.17.1



Re: [PATCH 1/4] dma-mapping: move the remaining DMA API calls out of line

2020-04-14 Thread Alexey Kardashevskiy



On 14/04/2020 22:25, Christoph Hellwig wrote:
> For a long time the DMA API has been implemented inline in dma-mapping.h,
> but the function bodies can be quite large.  Move them all out of line.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  include/linux/dma-direct.h  |  58 +
>  include/linux/dma-mapping.h | 247 
>  kernel/dma/direct.c |   9 --
>  kernel/dma/mapping.c| 164 
>  4 files changed, 244 insertions(+), 234 deletions(-)
> 
> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index 24b8684aa21d..da689ad5fffd 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -85,4 +85,62 @@ int dma_direct_mmap(struct device *dev, struct 
> vm_area_struct *vma,
>   void *cpu_addr, dma_addr_t dma_addr, size_t size,
>   unsigned long attrs);
>  int dma_direct_supported(struct device *dev, u64 mask);
> +dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
> + unsigned long offset, size_t size, enum dma_data_direction dir,
> + unsigned long attrs);
> +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
> + enum dma_data_direction dir, unsigned long attrs);
> +dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
> + size_t size, enum dma_data_direction dir, unsigned long attrs);
> +
> +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
> +defined(CONFIG_SWIOTLB)
> +void dma_direct_sync_single_for_device(struct device *dev,
> + dma_addr_t addr, size_t size, enum dma_data_direction dir);
> +void dma_direct_sync_sg_for_device(struct device *dev,
> + struct scatterlist *sgl, int nents, enum dma_data_direction 
> dir);
> +#else
> +static inline void dma_direct_sync_single_for_device(struct device *dev,
> + dma_addr_t addr, size_t size, enum dma_data_direction dir)
> +{
> +}
> +static inline void dma_direct_sync_sg_for_device(struct device *dev,
> + struct scatterlist *sgl, int nents, enum dma_data_direction dir)
> +{
> +}
> +#endif
> +
> +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
> +defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
> +defined(CONFIG_SWIOTLB)
> +void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
> + size_t size, enum dma_data_direction dir, unsigned long attrs);
> +void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
> + int nents, enum dma_data_direction dir, unsigned long attrs);
> +void dma_direct_sync_single_for_cpu(struct device *dev,
> + dma_addr_t addr, size_t size, enum dma_data_direction dir);
> +void dma_direct_sync_sg_for_cpu(struct device *dev,
> + struct scatterlist *sgl, int nents, enum dma_data_direction 
> dir);
> +#else
> +static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
> + size_t size, enum dma_data_direction dir, unsigned long attrs)
> +{
> +}
> +static inline void dma_direct_unmap_sg(struct device *dev,
> + struct scatterlist *sgl, int nents, enum dma_data_direction dir,
> + unsigned long attrs)
> +{
> +}
> +static inline void dma_direct_sync_single_for_cpu(struct device *dev,
> + dma_addr_t addr, size_t size, enum dma_data_direction dir)
> +{
> +}
> +static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
> + struct scatterlist *sgl, int nents, enum dma_data_direction dir)
> +{
> +}
> +#endif
> +
> +size_t dma_direct_max_mapping_size(struct device *dev);
> +
>  #endif /* _LINUX_DMA_DIRECT_H */
> diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
> index 330ad58fbf4d..793ad775cd54 100644
> --- a/include/linux/dma-mapping.h
> +++ b/include/linux/dma-mapping.h
> @@ -188,73 +188,6 @@ static inline int dma_mmap_from_global_coherent(struct 
> vm_area_struct *vma,
>  }
>  #endif /* CONFIG_DMA_DECLARE_COHERENT */
>  
> -static inline bool dma_is_direct(const struct dma_map_ops *ops)
> -{
> - return likely(!ops);
> -}
> -
> -/*
> - * All the dma_direct_* declarations are here just for the indirect call 
> bypass,
> - * and must not be used directly drivers!
> - */
> -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
> - unsigned long offset, size_t size, enum dma_data_direction dir,
> - unsigned long attrs);
> -int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
> - enum dma_data_direction dir, unsigned long attrs);
> -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
> - size_t size, enum dma_data_direction dir, unsigned long attrs);
> -
> -#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
> -defined(CONFIG_SWIOTLB)
> -void dma_direct_sync_single_for_device(struct device *dev,
> - dma_addr_t addr, size_t size, enum 

RE: [PATCH] dt-bindings: iommu: renesas,ipmmu-vmsa: convert to json-schema

2020-04-14 Thread Yoshihiro Shimoda
Hi Robin,

> From: Robin Murphy, Sent: Wednesday, April 15, 2020 2:16 AM
> 
> On 2020-04-13 11:25 am, Yoshihiro Shimoda wrote:
> [...]
> > -Each bus master connected to an IPMMU must reference the IPMMU in its 
> > device
> > -node with the following property:
> > -
> > -  - iommus: A reference to the IPMMU in two cells. The first cell is a 
> > phandle
> > -to the IPMMU and the second cell the number of the micro-TLB that the
> > -device is connected to.
> 
> This definition of what the phandle argument means...
> 
> [...]
> > +  '#iommu-cells':
> > +const: 1
>  > +
> 
> ...deserves to be captured in a description here.

Thank you for the comment! I'll fix this.

Best regards,
Yoshihiro Shimoda

> Robin.


RE: [PATCH v1 2/2] vfio/pci: Emulate PASID/PRI capability for VFs

2020-04-14 Thread Tian, Kevin
> From: Alex Williamson 
> Sent: Wednesday, April 15, 2020 8:36 AM
> 
> On Tue, 14 Apr 2020 23:57:33 +
> "Tian, Kevin"  wrote:
> 
> > > From: Alex Williamson 
> > > Sent: Tuesday, April 14, 2020 11:24 PM
> > >
> > > On Tue, 14 Apr 2020 03:42:42 +
> > > "Tian, Kevin"  wrote:
> > >
> > > > > From: Alex Williamson 
> > > > > Sent: Tuesday, April 14, 2020 11:29 AM
> > > > >
> > > > > On Tue, 14 Apr 2020 02:40:58 +
> > > > > "Tian, Kevin"  wrote:
> > > > >
> > > > > > > From: Alex Williamson 
> > > > > > > Sent: Tuesday, April 14, 2020 3:21 AM
> > > > > > >
> > > > > > > On Mon, 13 Apr 2020 08:05:33 +
> > > > > > > "Tian, Kevin"  wrote:
> > > > > > >
> > > > > > > > > From: Tian, Kevin
> > > > > > > > > Sent: Monday, April 13, 2020 3:55 PM
> > > > > > > > >
> > > > > > > > > > From: Raj, Ashok 
> > > > > > > > > > Sent: Monday, April 13, 2020 11:11 AM
> > > > > > > > > >
> > > > > > > > > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson
> > > wrote:
> > > > > > > > > > > On Tue, 7 Apr 2020 21:00:21 -0700
> > > > > > > > > > > "Raj, Ashok"  wrote:
> > > > > > > > > > >
> > > > > > > > > > > > Hi Alex
> > > > > > > > > > > >
> > > > > > > > > > > > + Bjorn
> > > > > > > > > > >
> > > > > > > > > > >  + Don
> > > > > > > > > > >
> > > > > > > > > > > > FWIW I can't understand why PCI SIG went different ways
> > > with
> > > > > ATS,
> > > > > > > > > > > > where its enumerated on PF and VF. But for PASID and
> PRI its
> > > > > only
> > > > > > > > > > > > in PF.
> > > > > > > > > > > >
> > > > > > > > > > > > I'm checking with our internal SIG reps to followup on
> that.
> > > > > > > > > > > >
> > > > > > > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex
> Williamson
> > > > > wrote:
> > > > > > > > > > > > > > Is there vendor guarantee that hidden registers will
> locate
> > > at
> > > > > the
> > > > > > > > > > > > > > same offset between PF and VF config space?
> > > > > > > > > > > > >
> > > > > > > > > > > > > I'm not sure if the spec really precludes hidden 
> > > > > > > > > > > > > registers,
> > > but
> > > > > the
> > > > > > > > > > > > > fact that these registers are explicitly outside of 
> > > > > > > > > > > > > the
> > > capability
> > > > > > > > > > > > > chain implies they're only intended for device 
> > > > > > > > > > > > > specific
> use,
> > > so
> > > > > I'd
> > > > > > > say
> > > > > > > > > > > > > there are no guarantees about anything related to 
> > > > > > > > > > > > > these
> > > > > registers.
> > > > > > > > > > > >
> > > > > > > > > > > > As you had suggested in the other thread, we could
> consider
> > > > > > > > > > > > using the same offset as in PF, but even that's a better
> guess
> > > > > > > > > > > > still not reliable.
> > > > > > > > > > > >
> > > > > > > > > > > > The other option is to maybe extend driver ops in the PF
> to
> > > > > expose
> > > > > > > > > > > > where the offsets should be. Sort of adding the quirk in
> the
> > > > > > > > > > > > implementation.
> > > > > > > > > > > >
> > > > > > > > > > > > I'm not sure how prevalent are PASID and PRI in VF
> devices. If
> > > > > SIG is
> > > > > > > > > > resisting
> > > > > > > > > > > > making VF's first class citizen, we might ask them to 
> > > > > > > > > > > > add
> > > some
> > > > > > > verbiage
> > > > > > > > > > > > to suggest leave the same offsets as PF open to help
> > > emulation
> > > > > > > software.
> > > > > > > > > > >
> > > > > > > > > > > Even if we know where to expose these capabilities on the
> VF,
> > > it's
> > > > > not
> > > > > > > > > > > clear to me how we can actually virtualize the capability
> itself.
> > > If
> > > > > > > > > > > the spec defines, for example, an enable bit as r/w then
> > > software
> > > > > that
> > > > > > > > > > > interacts with that register expects the bit is settable.
> There's
> > > no
> > > > > > > > > > > protocol for "try to set the bit and re-read it to see if 
> > > > > > > > > > > the
> > > hardware
> > > > > > > > > > > accepted it".  Therefore a capability with a fixed enable 
> > > > > > > > > > > bit
> > > > > > > > > > > representing the state of the PF, not settable by the VF, 
> > > > > > > > > > > is
> > > > > > > > > > > disingenuous to the spec.
> > > > > > > > > >
> > > > > > > > > > I think we are all in violent agreement. A lot of times the 
> > > > > > > > > > pci
> spec
> > > > > gets
> > > > > > > > > > defined several years ahead of real products and no one
> > > > > remembers
> > > > > > > > > > the justification on why they restricted things the way they
> did.
> > > > > > > > > >
> > > > > > > > > > Maybe someone early product wasn't quite exposing these
> > > features
> > > > > to
> > > > > > > the
> > > > > > > > > > VF
> > > > > > > > > > and hence the spec is bug compatible :-)
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > If what we're trying to do is expose that PASID and PRI 
> > > > > > > > > > > are
> > > enabled
> > > > > on
> > > > > > > > > > > the 

Re: [PATCH v1 2/2] vfio/pci: Emulate PASID/PRI capability for VFs

2020-04-14 Thread Alex Williamson
On Tue, 14 Apr 2020 23:57:33 +
"Tian, Kevin"  wrote:

> > From: Alex Williamson 
> > Sent: Tuesday, April 14, 2020 11:24 PM
> > 
> > On Tue, 14 Apr 2020 03:42:42 +
> > "Tian, Kevin"  wrote:
> >   
> > > > From: Alex Williamson 
> > > > Sent: Tuesday, April 14, 2020 11:29 AM
> > > >
> > > > On Tue, 14 Apr 2020 02:40:58 +
> > > > "Tian, Kevin"  wrote:
> > > >  
> > > > > > From: Alex Williamson 
> > > > > > Sent: Tuesday, April 14, 2020 3:21 AM
> > > > > >
> > > > > > On Mon, 13 Apr 2020 08:05:33 +
> > > > > > "Tian, Kevin"  wrote:
> > > > > >  
> > > > > > > > From: Tian, Kevin
> > > > > > > > Sent: Monday, April 13, 2020 3:55 PM
> > > > > > > >  
> > > > > > > > > From: Raj, Ashok 
> > > > > > > > > Sent: Monday, April 13, 2020 11:11 AM
> > > > > > > > >
> > > > > > > > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson  
> > wrote:  
> > > > > > > > > > On Tue, 7 Apr 2020 21:00:21 -0700
> > > > > > > > > > "Raj, Ashok"  wrote:
> > > > > > > > > >  
> > > > > > > > > > > Hi Alex
> > > > > > > > > > >
> > > > > > > > > > > + Bjorn  
> > > > > > > > > >
> > > > > > > > > >  + Don
> > > > > > > > > >  
> > > > > > > > > > > FWIW I can't understand why PCI SIG went different ways  
> > with  
> > > > ATS,  
> > > > > > > > > > > where its enumerated on PF and VF. But for PASID and PRI 
> > > > > > > > > > > its  
> > > > only  
> > > > > > > > > > > in PF.
> > > > > > > > > > >
> > > > > > > > > > > I'm checking with our internal SIG reps to followup on 
> > > > > > > > > > > that.
> > > > > > > > > > >
> > > > > > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson 
> > > > > > > > > > >  
> > > > wrote:  
> > > > > > > > > > > > > Is there vendor guarantee that hidden registers will 
> > > > > > > > > > > > > locate  
> > at  
> > > > the  
> > > > > > > > > > > > > same offset between PF and VF config space?  
> > > > > > > > > > > >
> > > > > > > > > > > > I'm not sure if the spec really precludes hidden 
> > > > > > > > > > > > registers,  
> > but  
> > > > the  
> > > > > > > > > > > > fact that these registers are explicitly outside of the 
> > > > > > > > > > > >  
> > capability  
> > > > > > > > > > > > chain implies they're only intended for device specific 
> > > > > > > > > > > > use,  
> > so  
> > > > I'd  
> > > > > > say  
> > > > > > > > > > > > there are no guarantees about anything related to these 
> > > > > > > > > > > >  
> > > > registers.  
> > > > > > > > > > >
> > > > > > > > > > > As you had suggested in the other thread, we could 
> > > > > > > > > > > consider
> > > > > > > > > > > using the same offset as in PF, but even that's a better 
> > > > > > > > > > > guess
> > > > > > > > > > > still not reliable.
> > > > > > > > > > >
> > > > > > > > > > > The other option is to maybe extend driver ops in the PF 
> > > > > > > > > > > to  
> > > > expose  
> > > > > > > > > > > where the offsets should be. Sort of adding the quirk in 
> > > > > > > > > > > the
> > > > > > > > > > > implementation.
> > > > > > > > > > >
> > > > > > > > > > > I'm not sure how prevalent are PASID and PRI in VF 
> > > > > > > > > > > devices. If  
> > > > SIG is  
> > > > > > > > > resisting  
> > > > > > > > > > > making VF's first class citizen, we might ask them to add 
> > > > > > > > > > >  
> > some  
> > > > > > verbiage  
> > > > > > > > > > > to suggest leave the same offsets as PF open to help  
> > emulation  
> > > > > > software.  
> > > > > > > > > >
> > > > > > > > > > Even if we know where to expose these capabilities on the 
> > > > > > > > > > VF,  
> > it's  
> > > > not  
> > > > > > > > > > clear to me how we can actually virtualize the capability 
> > > > > > > > > > itself.  
> > If  
> > > > > > > > > > the spec defines, for example, an enable bit as r/w then  
> > software  
> > > > that  
> > > > > > > > > > interacts with that register expects the bit is settable.  
> > > > > > > > > > There's  
> > no  
> > > > > > > > > > protocol for "try to set the bit and re-read it to see if 
> > > > > > > > > > the  
> > hardware  
> > > > > > > > > > accepted it".  Therefore a capability with a fixed enable 
> > > > > > > > > > bit
> > > > > > > > > > representing the state of the PF, not settable by the VF, is
> > > > > > > > > > disingenuous to the spec.  
> > > > > > > > >
> > > > > > > > > I think we are all in violent agreement. A lot of times the 
> > > > > > > > > pci spec  
> > > > gets  
> > > > > > > > > defined several years ahead of real products and no one  
> > > > remembers  
> > > > > > > > > the justification on why they restricted things the way they 
> > > > > > > > > did.
> > > > > > > > >
> > > > > > > > > Maybe someone early product wasn't quite exposing these  
> > features  
> > > > to  
> > > > > > the  
> > > > > > > > > VF
> > > > > > > > > and hence the spec is bug compatible :-)
> > > > > > > > >  
> > > > > > > > > >
> > > > > > > > > > If what we're trying to do is expose that PASID and PRI are 
> > > > > > > > > >  

[patch 4/7] dma-direct: atomic allocations must come from atomic coherent pools

2020-04-14 Thread David Rientjes via iommu
When a device requires unencrypted memory and the context does not allow
blocking, memory must be returned from the atomic coherent pools.

This avoids the remap when CONFIG_DMA_DIRECT_REMAP is not enabled and the
config only requires CONFIG_DMA_COHERENT_POOL.  This will be used for
CONFIG_AMD_MEM_ENCRYPT in a subsequent patch.

Keep all memory in these pools unencrypted.  When set_memory_decrypted()
fails, the memory is not added to the pool.  If adding memory to the
genpool fails, and set_memory_encrypted() subsequently fails, there is
no alternative other than leaking the memory.
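
A minimal sketch of that ordering (illustrative only; the exact labels
and arguments are those of the pool.c hunk below where visible):

	ret = set_memory_decrypted((unsigned long)page_to_virt(page),
				   1 << order);
	if (ret)
		goto remove_mapping;		/* not added to the pool */

	ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page),
				pool_size, NUMA_NO_NODE);
	if (ret) {
		/* Try to re-encrypt before freeing the page ... */
		if (set_memory_encrypted((unsigned long)page_to_virt(page),
					 1 << order))
			return ret;	/* ... and leak it if even that fails. */
		goto remove_mapping;
	}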

Signed-off-by: David Rientjes 
---
 kernel/dma/direct.c | 46 ++---
 kernel/dma/pool.c   | 27 +++---
 2 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index a834ee22f8ff..07ecc5c4d134 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -76,6 +76,39 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
min_not_zero(dev->coherent_dma_mask, 
dev->bus_dma_limit);
 }
 
+/*
+ * Decrypting memory is allowed to block, so if this device requires
+ * unencrypted memory it must come from atomic pools.
+ */
+static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp,
+ unsigned long attrs)
+{
+   if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
+   return false;
+   if (gfpflags_allow_blocking(gfp))
+   return false;
+   if (force_dma_unencrypted(dev))
+   return true;
+   if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP))
+   return false;
+   if (dma_alloc_need_uncached(dev, attrs))
+   return true;
+   return false;
+}
+
+static inline bool dma_should_free_from_pool(struct device *dev,
+unsigned long attrs)
+{
+   if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
+   return true;
+   if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
+   !force_dma_unencrypted(dev))
+   return false;
+   if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP))
+   return true;
+   return false;
+}
+
 struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp_t gfp, unsigned long attrs)
 {
@@ -125,9 +158,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
struct page *page;
void *ret;
 
-   if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-   dma_alloc_need_uncached(dev, attrs) &&
-   !gfpflags_allow_blocking(gfp)) {
+   if (dma_should_alloc_from_pool(dev, gfp, attrs)) {
ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp);
if (!ret)
return NULL;
@@ -204,6 +235,11 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
 {
unsigned int page_order = get_order(size);
 
+   /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
+   if (dma_should_free_from_pool(dev, attrs) &&
+   dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
+   return;
+
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
!force_dma_unencrypted(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
@@ -211,10 +247,6 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
return;
}
 
-   if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-   dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
-   return;
-
if (force_dma_unencrypted(dev))
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
 
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 9e2da17ed17b..cf052314d9e4 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -53,22 +54,42 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
 
arch_dma_prep_coherent(page, pool_size);
 
+#ifdef CONFIG_DMA_DIRECT_REMAP
addr = dma_common_contiguous_remap(page, pool_size,
   pgprot_dmacoherent(PAGE_KERNEL),
   __builtin_return_address(0));
if (!addr)
goto free_page;
-
+#else
+   addr = page_to_virt(page);
+#endif
+   /*
+* Memory in the atomic DMA pools must be unencrypted, the pools do not
+* shrink so no re-encryption occurs in dma_direct_free_pages().
+*/
+   ret = set_memory_decrypted((unsigned long)page_to_virt(page),
+  1 << order);
+   if (ret)
+   goto remove_mapping;
ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page),
  

[patch 1/7] dma-remap: separate DMA atomic pools from direct remap code

2020-04-14 Thread David Rientjes via iommu
DMA atomic pools will be needed beyond only CONFIG_DMA_DIRECT_REMAP so
separate them out into their own file.

This also adds a new Kconfig option that can be subsequently used for
options, such as CONFIG_AMD_MEM_ENCRYPT, that will utilize the coherent
pools but do not have a dependency on direct remapping.

For this patch alone, there is no functional change introduced.

Reviewed-by: Christoph Hellwig 
Signed-off-by: David Rientjes 
---
 kernel/dma/Kconfig  |   6 ++-
 kernel/dma/Makefile |   1 +
 kernel/dma/pool.c   | 123 
 kernel/dma/remap.c  | 114 
 4 files changed, 129 insertions(+), 115 deletions(-)
 create mode 100644 kernel/dma/pool.c

diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 4c103a24e380..d006668c0027 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -79,10 +79,14 @@ config DMA_REMAP
select DMA_NONCOHERENT_MMAP
bool
 
-config DMA_DIRECT_REMAP
+config DMA_COHERENT_POOL
bool
select DMA_REMAP
 
+config DMA_DIRECT_REMAP
+   bool
+   select DMA_COHERENT_POOL
+
 config DMA_CMA
bool "DMA Contiguous Memory Allocator"
depends on HAVE_DMA_CONTIGUOUS && CMA
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index d237cf3dc181..370f63344e9c 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -6,4 +6,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT)  += coherent.o
 obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
 obj-$(CONFIG_DMA_API_DEBUG)+= debug.o
 obj-$(CONFIG_SWIOTLB)  += swiotlb.o
+obj-$(CONFIG_DMA_COHERENT_POOL)+= pool.o
 obj-$(CONFIG_DMA_REMAP)+= remap.o
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
new file mode 100644
index ..6612c2d51d3c
--- /dev/null
+++ b/kernel/dma/pool.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Google LLC
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static struct gen_pool *atomic_pool __ro_after_init;
+
+#define DEFAULT_DMA_COHERENT_POOL_SIZE  SZ_256K
+static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE;
+
+static int __init early_coherent_pool(char *p)
+{
+   atomic_pool_size = memparse(p, &p);
+   return 0;
+}
+early_param("coherent_pool", early_coherent_pool);
+
+static gfp_t dma_atomic_pool_gfp(void)
+{
+   if (IS_ENABLED(CONFIG_ZONE_DMA))
+   return GFP_DMA;
+   if (IS_ENABLED(CONFIG_ZONE_DMA32))
+   return GFP_DMA32;
+   return GFP_KERNEL;
+}
+
+static int __init dma_atomic_pool_init(void)
+{
+   unsigned int pool_size_order = get_order(atomic_pool_size);
+   unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
+   struct page *page;
+   void *addr;
+   int ret;
+
+   if (dev_get_cma_area(NULL))
+   page = dma_alloc_from_contiguous(NULL, nr_pages,
+pool_size_order, false);
+   else
+   page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order);
+   if (!page)
+   goto out;
+
+   arch_dma_prep_coherent(page, atomic_pool_size);
+
+   atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
+   if (!atomic_pool)
+   goto free_page;
+
+   addr = dma_common_contiguous_remap(page, atomic_pool_size,
+  pgprot_dmacoherent(PAGE_KERNEL),
+  __builtin_return_address(0));
+   if (!addr)
+   goto destroy_genpool;
+
+   ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr,
+   page_to_phys(page), atomic_pool_size, -1);
+   if (ret)
+   goto remove_mapping;
+   gen_pool_set_algo(atomic_pool, gen_pool_first_fit_order_align, NULL);
+
+   pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n",
+   atomic_pool_size / 1024);
+   return 0;
+
+remove_mapping:
+   dma_common_free_remap(addr, atomic_pool_size);
+destroy_genpool:
+   gen_pool_destroy(atomic_pool);
+   atomic_pool = NULL;
+free_page:
+   if (!dma_release_from_contiguous(NULL, page, nr_pages))
+   __free_pages(page, pool_size_order);
+out:
+   pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n",
+   atomic_pool_size / 1024);
+   return -ENOMEM;
+}
+postcore_initcall(dma_atomic_pool_init);
+
+bool dma_in_atomic_pool(void *start, size_t size)
+{
+   if (unlikely(!atomic_pool))
+   return false;
+
+   return gen_pool_has_addr(atomic_pool, (unsigned long)start, size);
+}
+
+void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags)
+{
+   unsigned long val;
+   void *ptr = NULL;
+
+   if (!atomic_pool) {
+   WARN(1, "coherent pool not initialised!\n");
+   return NULL;
+  

[patch 3/7] dma-pool: dynamically expanding atomic pools

2020-04-14 Thread David Rientjes via iommu
When an atomic pool becomes fully depleted because it is now relied upon
for all non-blocking allocations through the DMA API, allow background
expansion of each pool by a kworker.

When an atomic pool has less than the default size of memory left, kick
off a kworker to dynamically expand the pool in the background.  The pool
is doubled in size, up to MAX_ORDER-1.  If memory cannot be allocated at
the requested order, smaller allocation(s) are attempted.

This allows the default size to be kept quite low when one or more of the
atomic pools is not used.

Allocations for lowmem should also use GFP_KERNEL for the benefits of
reclaim, so use GFP_KERNEL | GFP_DMA and GFP_KERNEL | GFP_DMA32 for
lowmem allocations.

This also allows __dma_atomic_pool_init() to return a pointer to the pool
to make initialization cleaner.

Also switch over some node ids to the more appropriate NUMA_NO_NODE.
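
As a worked example with 4K pages and the default 256KB pool size:
get_order(256KB) is 6, so an expansion first attempts an order-6
(256KB) allocation; if that fails it retries at order 5 (128KB), then
order 4, and so on down to order 0, so the pool grows by whatever
contiguous chunk the page allocator can provide.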

Signed-off-by: David Rientjes 
---
 kernel/dma/pool.c | 122 +++---
 1 file changed, 84 insertions(+), 38 deletions(-)

diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 5c98ab991b16..9e2da17ed17b 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -9,13 +9,17 @@
 #include 
 #include 
 #include 
+#include 
 
 static struct gen_pool *atomic_pool_dma __ro_after_init;
 static struct gen_pool *atomic_pool_dma32 __ro_after_init;
 static struct gen_pool *atomic_pool_kernel __ro_after_init;
 
 #define DEFAULT_DMA_COHERENT_POOL_SIZE  SZ_256K
-static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE;
+static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE;
+
+/* Dynamic background expansion when the atomic pool is near capacity */
+static struct work_struct atomic_pool_work;
 
 static int __init early_coherent_pool(char *p)
 {
@@ -24,76 +28,116 @@ static int __init early_coherent_pool(char *p)
 }
 early_param("coherent_pool", early_coherent_pool);
 
-static int __init __dma_atomic_pool_init(struct gen_pool **pool,
-size_t pool_size, gfp_t gfp)
+static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
+ gfp_t gfp)
 {
-   const unsigned int order = get_order(pool_size);
-   const unsigned long nr_pages = pool_size >> PAGE_SHIFT;
+   unsigned int order;
struct page *page;
void *addr;
-   int ret;
+   int ret = -ENOMEM;
+
+   /* Cannot allocate larger than MAX_ORDER-1 */
+   order = min(get_order(pool_size), MAX_ORDER-1);
+
+   do {
+   pool_size = 1 << (PAGE_SHIFT + order);
 
-   if (dev_get_cma_area(NULL))
-   page = dma_alloc_from_contiguous(NULL, nr_pages, order, false);
-   else
-   page = alloc_pages(gfp, order);
+   if (dev_get_cma_area(NULL))
+   page = dma_alloc_from_contiguous(NULL, 1 << order,
+order, false);
+   else
+   page = alloc_pages(gfp, order);
+   } while (!page && order-- > 0);
if (!page)
goto out;
 
arch_dma_prep_coherent(page, pool_size);
 
-   *pool = gen_pool_create(PAGE_SHIFT, -1);
-   if (!*pool)
-   goto free_page;
-
addr = dma_common_contiguous_remap(page, pool_size,
   pgprot_dmacoherent(PAGE_KERNEL),
   __builtin_return_address(0));
if (!addr)
-   goto destroy_genpool;
+   goto free_page;
 
-   ret = gen_pool_add_virt(*pool, (unsigned long)addr, page_to_phys(page),
-   pool_size, -1);
+   ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page),
+   pool_size, NUMA_NO_NODE);
if (ret)
goto remove_mapping;
-   gen_pool_set_algo(*pool, gen_pool_first_fit_order_align, NULL);
 
-   pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n",
-   pool_size >> 10, &gfp);
return 0;
 
 remove_mapping:
dma_common_free_remap(addr, pool_size);
-destroy_genpool:
-   gen_pool_destroy(*pool);
-   *pool = NULL;
 free_page:
-   if (!dma_release_from_contiguous(NULL, page, nr_pages))
+   if (!dma_release_from_contiguous(NULL, page, 1 << order))
__free_pages(page, order);
 out:
-   pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n",
-  pool_size >> 10, &gfp);
-   return -ENOMEM;
+   return ret;
+}
+
+static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp)
+{
+   if (pool && gen_pool_avail(pool) < atomic_pool_size)
+   atomic_pool_expand(pool, gen_pool_size(pool), gfp);
+}
+
+static void atomic_pool_work_fn(struct work_struct *work)
+{
+   if (IS_ENABLED(CONFIG_ZONE_DMA))
+   atomic_pool_resize(atomic_pool_dma,
+

[patch 6/7] x86/mm: unencrypted non-blocking DMA allocations use coherent pools

2020-04-14 Thread David Rientjes via iommu
When CONFIG_AMD_MEM_ENCRYPT is enabled and a device requires unencrypted
DMA, all non-blocking allocations must originate from the atomic DMA
coherent pools.

Select CONFIG_DMA_COHERENT_POOL for CONFIG_AMD_MEM_ENCRYPT.

Signed-off-by: David Rientjes 
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d6104ea8af0..2bf819d3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1520,6 +1520,7 @@ config X86_CPA_STATISTICS
 config AMD_MEM_ENCRYPT
bool "AMD Secure Memory Encryption (SME) support"
depends on X86_64 && CPU_SUP_AMD
+   select DMA_COHERENT_POOL
select DYNAMIC_PHYSICAL_MASK
select ARCH_USE_MEMREMAP_PROT
select ARCH_HAS_FORCE_DMA_UNENCRYPTED


[patch 0/7] unencrypted atomic DMA pools with dynamic expansion

2020-04-14 Thread David Rientjes via iommu
set_memory_decrypted() may block, so it is not possible to do non-blocking
allocations through the DMA API for devices that require unencrypted
memory.

The solution is to expand the atomic DMA pools for the various possible
gfp requirements as a means to prevent an unnecessary depletion of lowmem.
These atomic pools are separated from the remap code and can be selected
for configurations that need them outside the scope of
CONFIG_DMA_DIRECT_REMAP, such as CONFIG_AMD_MEM_ENCRYPT.

These atomic DMA pools are kept unencrypted so they can immediately be
used for non-blocking allocations.  Since the need for this type of memory
depends on the kernel config and devices being used, these pools are also
dynamically expandable.
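
For illustration, here is a minimal sketch (not part of the series) of the
kind of caller these pools end up serving; the device pointer and buffer
size are placeholders:

#include <linux/dma-mapping.h>
#include <linux/sizes.h>

/*
 * A non-blocking coherent allocation: GFP_ATOMIC forbids sleeping, so
 * set_memory_decrypted() cannot be called on this path.  With these
 * patches the buffer is instead carved out of a pre-decrypted atomic
 * pool that matches the device's coherent DMA mask.
 */
static void *alloc_ring_atomic(struct device *dev, dma_addr_t *dma_handle)
{
	return dma_alloc_coherent(dev, SZ_4K, dma_handle, GFP_ATOMIC);
}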

The sizes of the various atomic DMA pools are exported through debugfs at
/sys/kernel/debug/dma_pools.

This patchset is based on latest Linus HEAD:

commit 8632e9b5645bbc2331d21d892b0d6961c1a08429
Merge: 6cc9306b8fc0 f3a99e761efa
Author: Linus Torvalds 
Date:   Tue Apr 14 11:58:04 2020 -0700

Merge tag 'hyperv-fixes-signed' of 
git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux
---
 arch/x86/Kconfig|   1 +
 drivers/iommu/dma-iommu.c   |   5 +-
 include/linux/dma-direct.h  |   2 +
 include/linux/dma-mapping.h |   6 +-
 kernel/dma/Kconfig  |   6 +-
 kernel/dma/Makefile |   1 +
 kernel/dma/direct.c |  56 ++--
 kernel/dma/pool.c   | 275 
 kernel/dma/remap.c  | 114 ---
 9 files changed, 334 insertions(+), 132 deletions(-)
 create mode 100644 kernel/dma/pool.c


[patch 7/7] dma-pool: scale the default DMA coherent pool size with memory capacity

2020-04-14 Thread David Rientjes via iommu
When AMD memory encryption is enabled, some devices may use more than
256KB/sec from the atomic pools.  It would be more appropriate to scale
the default size based on memory capacity unless the coherent_pool
option is used on the kernel command line.

This provides a slight optimization on initial expansion and is deemed
appropriate due to the increased reliance on the atomic pools.  Note that
the default size of 128KB per pool will normally be larger than the
single coherent pool implementation since there are now up to three
coherent pools (DMA, DMA32, and kernel).

Note that even prior to this patch, coherent_pool= for sizes larger than
1 << (PAGE_SHIFT + MAX_ORDER-1) can fail.  With new dynamic expansion
support, this would be trivially extensible to allow even larger initial
sizes.
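
For illustration, the sizing rule described above can be paraphrased as the
self-contained sketch below (this is not the hunk itself; it assumes 4KB
pages and MAX_ORDER of 11):

#include <stddef.h>

#define SZ_128K		(128UL * 1024)
#define SZ_1G		(1024UL * 1024 * 1024)
#define PAGE_SHIFT	12	/* assumed 4KB pages */
#define MAX_ORDER	11	/* assumed, typical x86_64 configuration */

/* 128KB per 1GB of memory, floor 128KB, ceiling one MAX_ORDER-1 allocation */
static size_t default_atomic_pool_size(unsigned long long total_bytes)
{
	size_t size = (size_t)(total_bytes / SZ_1G) * SZ_128K;

	if (size < SZ_128K)
		size = SZ_128K;
	if (size > (1UL << (PAGE_SHIFT + MAX_ORDER - 1)))
		size = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
	return size;
}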

Signed-off-by: David Rientjes 
---
 kernel/dma/pool.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 3e22022c933b..763b687569b0 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -22,8 +22,8 @@ static unsigned long pool_size_dma32;
 static unsigned long pool_size_kernel;
 #endif
 
-#define DEFAULT_DMA_COHERENT_POOL_SIZE  SZ_256K
-static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE;
+/* Size can be defined by the coherent_pool command line */
+static size_t atomic_pool_size;
 
 /* Dynamic background expansion when the atomic pool is near capacity */
 static struct work_struct atomic_pool_work;
@@ -181,6 +181,16 @@ static int __init dma_atomic_pool_init(void)
 {
int ret = 0;
 
+   /*
+* If coherent_pool was not used on the command line, default the pool
+* sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1.
+*/
+   if (!atomic_pool_size) {
+   atomic_pool_size = max(totalram_pages() >> PAGE_SHIFT, 1UL) *
+   SZ_128K;
+   atomic_pool_size = min_t(size_t, atomic_pool_size,
+1 << (PAGE_SHIFT + MAX_ORDER-1));
+   }
	INIT_WORK(&atomic_pool_work, atomic_pool_work_fn);
 
atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size,


[patch 2/7] dma-pool: add additional coherent pools to map to gfp mask

2020-04-14 Thread David Rientjes via iommu
The single atomic pool is allocated from the lowest zone possible since
it is guaranteed to be applicable for any DMA allocation.

Devices may allocate through the DMA API but not have a strict reliance
on GFP_DMA memory.  Since the atomic pool will be used for all
non-blockable allocations, returning all memory from ZONE_DMA may
unnecessarily deplete the zone.

Provision for multiple atomic pools that will map to the optimal gfp
mask of the device.

When allocating non-blockable memory, determine the optimal gfp mask of
the device and use the appropriate atomic pool.

The coherent DMA mask will remain the same between allocation and free
and, thus, memory will be freed to the same atomic pool it was allocated
from.

__dma_atomic_pool_init() will be changed to return struct gen_pool *
later once dynamic expansion is added.
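
For reference, the pool selection boils down to something like the
simplified sketch below (kernel context assumed; the actual helper added to
kernel/dma/pool.c handles the CONFIG_ZONE_DMA/CONFIG_ZONE_DMA32
combinations in more detail):

static struct gen_pool *dev_to_pool(struct device *dev)
{
	u64 phys_mask;
	gfp_t gfp;

	/* Same zone selection logic as the page allocator path. */
	gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
					  &phys_mask);
	if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
		return atomic_pool_dma;
	if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
		return atomic_pool_dma32;
	return atomic_pool_kernel;
}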

Signed-off-by: David Rientjes 
---
 drivers/iommu/dma-iommu.c   |   5 +-
 include/linux/dma-direct.h  |   2 +
 include/linux/dma-mapping.h |   6 +-
 kernel/dma/direct.c |  12 ++--
 kernel/dma/pool.c   | 120 +++-
 5 files changed, 91 insertions(+), 54 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index ba128d1cdaee..4959f5df21bd 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -952,7 +952,7 @@ static void __iommu_dma_free(struct device *dev, size_t 
size, void *cpu_addr)
 
/* Non-coherent atomic allocation? Easy */
if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-   dma_free_from_pool(cpu_addr, alloc_size))
+   dma_free_from_pool(dev, cpu_addr, alloc_size))
return;
 
if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
@@ -1035,7 +1035,8 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
 
if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
!gfpflags_allow_blocking(gfp) && !coherent)
-   cpu_addr = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp);
+   cpu_addr = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page,
+  gfp);
else
	cpu_addr = iommu_dma_alloc_pages(dev, size, &page, gfp, attrs);
if (!cpu_addr)
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 24b8684aa21d..136f984df0d9 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -67,6 +67,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t 
addr, size_t size,
 }
 
 u64 dma_direct_get_required_mask(struct device *dev);
+gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
+ u64 *phys_mask);
 void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp, unsigned long attrs);
 void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 330ad58fbf4d..b43116a6405d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -630,9 +630,9 @@ void *dma_common_pages_remap(struct page **pages, size_t 
size,
pgprot_t prot, const void *caller);
 void dma_common_free_remap(void *cpu_addr, size_t size);
 
-bool dma_in_atomic_pool(void *start, size_t size);
-void *dma_alloc_from_pool(size_t size, struct page **ret_page, gfp_t flags);
-bool dma_free_from_pool(void *start, size_t size);
+void *dma_alloc_from_pool(struct device *dev, size_t size,
+ struct page **ret_page, gfp_t flags);
+bool dma_free_from_pool(struct device *dev, void *start, size_t size);
 
 int
 dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void 
*cpu_addr,
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 8f4bbdaf965e..a834ee22f8ff 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -45,8 +45,8 @@ u64 dma_direct_get_required_mask(struct device *dev)
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
 }
 
-static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
-   u64 *phys_limit)
+gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
+ u64 *phys_limit)
 {
u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit);
 
@@ -89,8 +89,8 @@ struct page *__dma_direct_alloc_pages(struct device *dev, 
size_t size,
 
/* we always manually zero the memory once we are done: */
gfp &= ~__GFP_ZERO;
-   gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
-   &phys_limit);
+   gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
+  &phys_limit);
page = dma_alloc_contiguous(dev, alloc_size, gfp);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_free_contiguous(dev, page, alloc_size);
@@ -128,7 +128,7 @@ void *dma_direct_alloc_pages(struct device 

[patch 5/7] dma-pool: add pool sizes to debugfs

2020-04-14 Thread David Rientjes via iommu
The atomic DMA pools can dynamically expand based on non-blocking
allocations that need to use them.

Export the sizes of each of these pools, in bytes, through debugfs for
measurement.
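
For example, a trivial userspace reader of these files could look like the
sketch below (assumes debugfs is mounted at /sys/kernel/debug and the
program runs as root, since the files are created with mode 0400):

#include <stdio.h>

int main(void)
{
	static const char * const files[] = {
		"/sys/kernel/debug/dma_pools/pool_size_dma",
		"/sys/kernel/debug/dma_pools/pool_size_dma32",
		"/sys/kernel/debug/dma_pools/pool_size_kernel",
	};

	for (int i = 0; i < 3; i++) {
		unsigned long bytes;
		FILE *f = fopen(files[i], "r");

		if (!f)
			continue;	/* pool not configured on this kernel */
		if (fscanf(f, "%lu", &bytes) == 1)
			printf("%s: %lu bytes\n", files[i], bytes);
		fclose(f);
	}
	return 0;
}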

Suggested-by: Christoph Hellwig 
Signed-off-by: David Rientjes 
---
 kernel/dma/pool.c | 41 +
 1 file changed, 41 insertions(+)

diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index cf052314d9e4..3e22022c933b 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -2,6 +2,7 @@
 /*
  * Copyright (C) 2020 Google LLC
  */
+#include 
 #include 
 #include 
 #include 
@@ -15,6 +16,11 @@
 static struct gen_pool *atomic_pool_dma __ro_after_init;
 static struct gen_pool *atomic_pool_dma32 __ro_after_init;
 static struct gen_pool *atomic_pool_kernel __ro_after_init;
+#ifdef CONFIG_DEBUG_FS
+static unsigned long pool_size_dma;
+static unsigned long pool_size_dma32;
+static unsigned long pool_size_kernel;
+#endif
 
 #define DEFAULT_DMA_COHERENT_POOL_SIZE  SZ_256K
 static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE;
@@ -29,6 +35,38 @@ static int __init early_coherent_pool(char *p)
 }
 early_param("coherent_pool", early_coherent_pool);
 
+#ifdef CONFIG_DEBUG_FS
+static void __init dma_atomic_pool_debugfs_init(void)
+{
+   struct dentry *root;
+
+   root = debugfs_create_dir("dma_pools", NULL);
+   if (IS_ERR_OR_NULL(root))
+   return;
+
+   debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma);
+   debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32);
+   debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel);
+}
+
+static void dma_atomic_pool_size_add(gfp_t gfp, size_t size)
+{
+   if (gfp & __GFP_DMA)
+   pool_size_dma += size;
+   else if (gfp & __GFP_DMA32)
+   pool_size_dma32 += size;
+   else
+   pool_size_kernel += size;
+}
+#else
+static inline void dma_atomic_pool_debugfs_init(void)
+{
+}
+static inline void dma_atomic_pool_size_add(gfp_t gfp, size_t size)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
 static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
  gfp_t gfp)
 {
@@ -76,6 +114,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t 
pool_size,
if (ret)
goto encrypt_mapping;
 
+   dma_atomic_pool_size_add(gfp, pool_size);
return 0;
 
 encrypt_mapping:
@@ -160,6 +199,8 @@ static int __init dma_atomic_pool_init(void)
if (!atomic_pool_dma32)
ret = -ENOMEM;
}
+
+   dma_atomic_pool_debugfs_init();
return ret;
 }
 postcore_initcall(dma_atomic_pool_init);


RE: [PATCH v1 2/2] vfio/pci: Emulate PASID/PRI capability for VFs

2020-04-14 Thread Tian, Kevin
> From: Alex Williamson 
> Sent: Tuesday, April 14, 2020 11:24 PM
> 
> On Tue, 14 Apr 2020 03:42:42 +
> "Tian, Kevin"  wrote:
> 
> > > From: Alex Williamson 
> > > Sent: Tuesday, April 14, 2020 11:29 AM
> > >
> > > On Tue, 14 Apr 2020 02:40:58 +
> > > "Tian, Kevin"  wrote:
> > >
> > > > > From: Alex Williamson 
> > > > > Sent: Tuesday, April 14, 2020 3:21 AM
> > > > >
> > > > > On Mon, 13 Apr 2020 08:05:33 +
> > > > > "Tian, Kevin"  wrote:
> > > > >
> > > > > > > From: Tian, Kevin
> > > > > > > Sent: Monday, April 13, 2020 3:55 PM
> > > > > > >
> > > > > > > > From: Raj, Ashok 
> > > > > > > > Sent: Monday, April 13, 2020 11:11 AM
> > > > > > > >
> > > > > > > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson
> wrote:
> > > > > > > > > On Tue, 7 Apr 2020 21:00:21 -0700
> > > > > > > > > "Raj, Ashok"  wrote:
> > > > > > > > >
> > > > > > > > > > Hi Alex
> > > > > > > > > >
> > > > > > > > > > + Bjorn
> > > > > > > > >
> > > > > > > > >  + Don
> > > > > > > > >
> > > > > > > > > > FWIW I can't understand why PCI SIG went different ways
> with
> > > ATS,
> > > > > > > > > > where its enumerated on PF and VF. But for PASID and PRI its
> > > only
> > > > > > > > > > in PF.
> > > > > > > > > >
> > > > > > > > > > I'm checking with our internal SIG reps to followup on that.
> > > > > > > > > >
> > > > > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson
> > > wrote:
> > > > > > > > > > > > Is there vendor guarantee that hidden registers will 
> > > > > > > > > > > > locate
> at
> > > the
> > > > > > > > > > > > same offset between PF and VF config space?
> > > > > > > > > > >
> > > > > > > > > > > I'm not sure if the spec really precludes hidden 
> > > > > > > > > > > registers,
> but
> > > the
> > > > > > > > > > > fact that these registers are explicitly outside of the
> capability
> > > > > > > > > > > chain implies they're only intended for device specific 
> > > > > > > > > > > use,
> so
> > > I'd
> > > > > say
> > > > > > > > > > > there are no guarantees about anything related to these
> > > registers.
> > > > > > > > > >
> > > > > > > > > > As you had suggested in the other thread, we could consider
> > > > > > > > > > using the same offset as in PF, but even that's a better 
> > > > > > > > > > guess
> > > > > > > > > > still not reliable.
> > > > > > > > > >
> > > > > > > > > > The other option is to maybe extend driver ops in the PF to
> > > expose
> > > > > > > > > > where the offsets should be. Sort of adding the quirk in the
> > > > > > > > > > implementation.
> > > > > > > > > >
> > > > > > > > > > I'm not sure how prevalent are PASID and PRI in VF devices. 
> > > > > > > > > > If
> > > SIG is
> > > > > > > > resisting
> > > > > > > > > > making VF's first class citizen, we might ask them to add
> some
> > > > > verbiage
> > > > > > > > > > to suggest leave the same offsets as PF open to help
> emulation
> > > > > software.
> > > > > > > > >
> > > > > > > > > Even if we know where to expose these capabilities on the VF,
> it's
> > > not
> > > > > > > > > clear to me how we can actually virtualize the capability 
> > > > > > > > > itself.
> If
> > > > > > > > > the spec defines, for example, an enable bit as r/w then
> software
> > > that
> > > > > > > > > interacts with that register expects the bit is settable.  
> > > > > > > > > There's
> no
> > > > > > > > > protocol for "try to set the bit and re-read it to see if the
> hardware
> > > > > > > > > accepted it".  Therefore a capability with a fixed enable bit
> > > > > > > > > representing the state of the PF, not settable by the VF, is
> > > > > > > > > disingenuous to the spec.
> > > > > > > >
> > > > > > > > I think we are all in violent agreement. A lot of times the pci 
> > > > > > > > spec
> > > gets
> > > > > > > > defined several years ahead of real products and no one
> > > remembers
> > > > > > > > the justification on why they restricted things the way they 
> > > > > > > > did.
> > > > > > > >
> > > > > > > > Maybe someone early product wasn't quite exposing these
> features
> > > to
> > > > > the
> > > > > > > > VF
> > > > > > > > and hence the spec is bug compatible :-)
> > > > > > > >
> > > > > > > > >
> > > > > > > > > If what we're trying to do is expose that PASID and PRI are
> enabled
> > > on
> > > > > > > > > the PF to a VF driver, maybe duplicating the PF capabilities 
> > > > > > > > > on
> the
> > > VF
> > > > > > > > > without the ability to control it is not the right approach.
> Maybe
> > > we
> > > > > > > >
> > > > > > > > As long as the capability enable is only provided when the PF 
> > > > > > > > has
> > > > > enabled
> > > > > > > > the feature. Then it seems the hardware seems to do the right
> thing.
> > > > > > > >
> > > > > > > > Assume we expose PASID/PRI only when PF has enabled it. It will
> be
> > > the
> > > > > > > > case since the PF driver needs to exist, and IOMMU would have
> set
> > > the
> > > > > > > > PASID/PRI/ATS on PF.
> > > > > > > >
> > > > > > > > 

RE: [PATCH v2 1/3] iommu/uapi: Define uapi version and capabilities

2020-04-14 Thread Tian, Kevin
> From: Jacob Pan 
> Sent: Wednesday, April 15, 2020 6:32 AM
> 
> On Tue, 14 Apr 2020 10:13:04 -0700
> Jacob Pan  wrote:
> 
> > > > >  In any of the proposed solutions, the
> > > > > IOMMU driver is ultimately responsible for validating the user
> > > > > data, so do we want vfio performing the copy_from_user() to an
> > > > > object that could later be assumed to be sanitized, or should
> > > > > vfio just pass a user pointer to make it obvious that the
> > > > > consumer is responsible for all the user protections?  Seems
> > > > > like the latter.
> > > > I like the latter as well.
> > > >
> On a second thought, I think the former is better. Two reasons:
> 
> 1. IOMMU API such as page_response is also used in baremetal. So it is
> not suitable to pass a __user *.
> https://www.spinics.net/lists/arm-kernel/msg798677.html

You can have a wrapped version accepting a __user* and an internal
version for kernel pointers.
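
For instance (purely illustrative names and layout, kernel context
assumed), something like:

struct iommu_example_data {
	__u32	argsz;
	__u32	flags;
	__u64	gpasid;
};

/* Internal version: takes a kernel pointer, also usable on bare metal. */
int iommu_example_bind(struct iommu_domain *domain,
		       struct iommu_example_data *data);

/* Thin wrapper for the VFIO path: copy from user space, then call the
 * internal version. */
static int iommu_example_bind_user(struct iommu_domain *domain,
				   void __user *uptr)
{
	struct iommu_example_data data;

	if (copy_from_user(&data, uptr, sizeof(data)))
		return -EFAULT;
	return iommu_example_bind(domain, &data);
}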

> 
> 2. Some data are in the mandatory (fixed offset, never removed or
> extended) portion of the uAPI structure. It is simpler for VFIO to
> extract that and pass it to IOMMU API. For example, the PASID value used
> for unbind_gpasid(). VFIO also need to sanitize the PASID value to make
> sure it belongs to the same VM that did the allocation.

I don't think this makes much difference. If you still plan to let the
IOMMU driver parse some user pointers anyway, why not make a clean
split and have it parse all IOMMU-specific fields?

Thanks
Kevin

> 
> 
> > > > >  That still really
> > > > > doesn't address what's in that user data blob yet, but the vfio
> > > > > interface could be:
> > > > >
> > > > > struct {
> > > > >   __u32 argsz;
> > > > >   __u32 flags;
> > > > >   __u8  data[];
> > > > > }
> > > > >
> > > > > Where flags might be partitioned like we do for DEVICE_FEATURE
> > > > > to indicate the format of data and what vfio should do with it,
> > > > > and data might simply be defined as a (__u64 __user *).
> > > > >
> > > > So, __user * will be passed to IOMMU driver if VFIO checks minsz
> > > > include flags and they are valid.
> > > > IOMMU driver can copy the rest based on the mandatory
> > > > version/minsz and flags in the IOMMU uAPI structs.
> > > > Does it sound right? This is really choice #2.
> > >
> > > Sounds like each IOMMU UAPI struct just needs to have an embedded
> > > size and flags field, but yes.
> > >
> > Yes, an argsz field can be added to each UAPI. There are already flags
> > or the equivalent. IOMMU driver can process the __user * based on the
> > argsz, flags, check argsz against offsetofend(iommu_uapi_struct,
> > last_element), etc.;


Re: [PATCH v2 1/3] iommu/uapi: Define uapi version and capabilities

2020-04-14 Thread Jacob Pan
On Tue, 14 Apr 2020 10:13:04 -0700
Jacob Pan  wrote:

> > > >  In any of the proposed solutions, the
> > > > IOMMU driver is ultimately responsible for validating the user
> > > > data, so do we want vfio performing the copy_from_user() to an
> > > > object that could later be assumed to be sanitized, or should
> > > > vfio just pass a user pointer to make it obvious that the
> > > > consumer is responsible for all the user protections?  Seems
> > > > like the latter.  
> > > I like the latter as well.
> > > 
On a second thought, I think the former is better. Two reasons:

1. IOMMU API such as page_response is also used in baremetal. So it is
not suitable to pass a __user *.
https://www.spinics.net/lists/arm-kernel/msg798677.html

2. Some data are in the mandatory (fixed offset, never removed or
extended) portion of the uAPI structure. It is simpler for VFIO to
extract that and pass it to IOMMU API. For example, the PASID value used
for unbind_gpasid(). VFIO also need to sanitize the PASID value to make
sure it belongs to the same VM that did the allocation.


> > > >  That still really
> > > > doesn't address what's in that user data blob yet, but the vfio
> > > > interface could be:
> > > > 
> > > > struct {
> > > > __u32 argsz;
> > > > __u32 flags;
> > > > __u8  data[];
> > > > }
> > > > 
> > > > Where flags might be partitioned like we do for DEVICE_FEATURE
> > > > to indicate the format of data and what vfio should do with it,
> > > > and data might simply be defined as a (__u64 __user *).
> > > >   
> > > So, __user * will be passed to IOMMU driver if VFIO checks minsz
> > > include flags and they are valid.
> > > IOMMU driver can copy the rest based on the mandatory
> > > version/minsz and flags in the IOMMU uAPI structs.
> > > Does it sound right? This is really choice #2.
> > 
> > Sounds like each IOMMU UAPI struct just needs to have an embedded
> > size and flags field, but yes.
> >   
> Yes, an argsz field can be added to each UAPI. There are already flags
> or the equivalent. IOMMU driver can process the __user * based on the
> argsz, flags, check argsz against offsetofend(iommu_uapi_struct,
> last_element), etc.;


Re: [PATCH v11 01/13] iommu: Introduce attach/detach_pasid_table API

2020-04-14 Thread Jacob Pan
Hi Eric,

There are some discussions about how to size the uAPI data.
https://lkml.org/lkml/2020/4/14/939

I think the problem with the current scheme is that when the uAPI data
gets extended, if VFIO continues to use:

minsz = offsetofend(struct vfio_iommu_type1_set_pasid_table, config);
if (copy_from_user(, (void __user *)arg, minsz))

It may copy more data from user space than the user actually set up.

So, as suggested by Alex, we could add argsz to the IOMMU uAPI struct.
If argsz > minsz, fail the attach_table, since the kernel might be old
and not know about the extra data.
If argsz <= minsz, the kernel can support the attach_table but must
process the data based on flags or config.
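
As a purely illustrative sketch of that rule (all names made up, and argsz
assumed to sit at offset 0 of the user structure):

static int example_copy_uapi(void __user *uptr, void *kbuf, size_t minsz)
{
	u32 argsz;

	if (copy_from_user(&argsz, uptr, sizeof(argsz)))
		return -EFAULT;
	/* User space passed fields this (older) kernel does not know about. */
	if (argsz > minsz)
		return -E2BIG;
	/* Copy only what the user provided; interpret it via flags/config. */
	if (copy_from_user(kbuf, uptr, argsz))
		return -EFAULT;
	return 0;
}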

Does it make sense to you?


On Tue, 14 Apr 2020 17:05:55 +0200
Eric Auger  wrote:

> From: Jacob Pan 
> 
> In virtualization use case, when a guest is assigned
> a PCI host device, protected by a virtual IOMMU on the guest,
> the physical IOMMU must be programmed to be consistent with
> the guest mappings. If the physical IOMMU supports two
> translation stages it makes sense to program guest mappings
> onto the first stage/level (ARM/Intel terminology) while the host
> owns the stage/level 2.
> 
> In that case, it is mandated to trap on guest configuration
> settings and pass those to the physical iommu driver.
> 
> This patch adds a new API to the iommu subsystem that allows
> to set/unset the pasid table information.
> 
> A generic iommu_pasid_table_config struct is introduced in
> a new iommu.h uapi header. This is going to be used by the VFIO
> user API.
> 
> Signed-off-by: Jean-Philippe Brucker 
> Signed-off-by: Liu, Yi L 
> Signed-off-by: Ashok Raj 
> Signed-off-by: Jacob Pan 
> Signed-off-by: Eric Auger 
> Reviewed-by: Jean-Philippe Brucker 
> ---
>  drivers/iommu/iommu.c  | 19 ++
>  include/linux/iommu.h  | 18 ++
>  include/uapi/linux/iommu.h | 51
> ++ 3 files changed, 88
> insertions(+)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 2b471419e26c..b71ad56f8c99 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -1723,6 +1723,25 @@ int iommu_sva_unbind_gpasid(struct
> iommu_domain *domain, struct device *dev, }
>  EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid);
>  
> +int iommu_attach_pasid_table(struct iommu_domain *domain,
> +  struct iommu_pasid_table_config *cfg)
> +{
> + if (unlikely(!domain->ops->attach_pasid_table))
> + return -ENODEV;
> +
> + return domain->ops->attach_pasid_table(domain, cfg);
> +}
> +EXPORT_SYMBOL_GPL(iommu_attach_pasid_table);
> +
> +void iommu_detach_pasid_table(struct iommu_domain *domain)
> +{
> + if (unlikely(!domain->ops->detach_pasid_table))
> + return;
> +
> + domain->ops->detach_pasid_table(domain);
> +}
> +EXPORT_SYMBOL_GPL(iommu_detach_pasid_table);
> +
>  static void __iommu_detach_device(struct iommu_domain *domain,
> struct device *dev)
>  {
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 7ef8b0bda695..3e1057c3585a 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -248,6 +248,8 @@ struct iommu_iotlb_gather {
>   * @cache_invalidate: invalidate translation caches
>   * @sva_bind_gpasid: bind guest pasid and mm
>   * @sva_unbind_gpasid: unbind guest pasid and mm
> + * @attach_pasid_table: attach a pasid table
> + * @detach_pasid_table: detach the pasid table
>   * @pgsize_bitmap: bitmap of all possible supported page sizes
>   * @owner: Driver module providing these ops
>   */
> @@ -307,6 +309,9 @@ struct iommu_ops {
> void *drvdata);
>   void (*sva_unbind)(struct iommu_sva *handle);
>   int (*sva_get_pasid)(struct iommu_sva *handle);
> + int (*attach_pasid_table)(struct iommu_domain *domain,
> +   struct iommu_pasid_table_config
> *cfg);
> + void (*detach_pasid_table)(struct iommu_domain *domain);
>  
>   int (*page_response)(struct device *dev,
>struct iommu_fault_event *evt,
> @@ -446,6 +451,9 @@ extern int iommu_sva_bind_gpasid(struct
> iommu_domain *domain, struct device *dev, struct
> iommu_gpasid_bind_data *data); extern int
> iommu_sva_unbind_gpasid(struct iommu_domain *domain, struct device
> *dev, ioasid_t pasid); +extern int iommu_attach_pasid_table(struct
> iommu_domain *domain,
> + struct iommu_pasid_table_config
> *cfg); +extern void iommu_detach_pasid_table(struct iommu_domain
> *domain); extern struct iommu_domain *iommu_get_domain_for_dev(struct
> device *dev); extern struct iommu_domain *iommu_get_dma_domain(struct
> device *dev); extern int iommu_map(struct iommu_domain *domain,
> unsigned long iova, @@ -1048,6 +1056,16 @@ iommu_aux_get_pasid(struct
> iommu_domain *domain, struct device *dev) return -ENODEV;
>  }
>  
> +static inline
> +int iommu_attach_pasid_table(struct 

Re: [rfc v2 3/6] dma-pool: dynamically expanding atomic pools

2020-04-14 Thread David Rientjes via iommu
On Tue, 14 Apr 2020, Christoph Hellwig wrote:

> > I'll rely on Christoph to determine whether it makes sense to add some
> > periodic scavenging of the atomic pools, whether that's needed for this to
> > be merged, or whether we should enforce some maximum pool size.
> 
> I don't really see the point.  In fact the only part of the series
> I feel uneasy about is the growing of the pools, because it already
> adds a fair amount of complexity that we might not need for simple
> things, but shrinking really doesn't make any sense.  So I'm tempted
> to not ever support shrinking, and even make growing optional code under
> a new config variable.  We'll also need a way to query the current size
> through e.g. a debugfs file.
> 

New debugfs file sounds good, I'll add it.  If we want to disable dynamic 
expansion when the pool is depleted under a new config option, let me 
know.


Re: [rfc v2 4/6] dma-direct: atomic allocations must come from atomic coherent pools

2020-04-14 Thread David Rientjes via iommu
On Tue, 14 Apr 2020, Christoph Hellwig wrote:

> > +   /*
> > +* Unencrypted memory must come directly from DMA atomic pools if
> > +* blocking is not allowed.
> > +*/
> > +   if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
> > +   force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp)) {
> > +   ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp);
> > +   if (!ret)
> > +   return NULL;
> > +   goto done;
> > +   }
> > +
> > if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
> > dma_alloc_need_uncached(dev, attrs) &&
> > !gfpflags_allow_blocking(gfp)) {
> 
> Can we keep a single conditional for the pool allocations?  Maybe
> add a new dma_alloc_from_pool helper ala:
> 
> static inline bool dma_alloc_from_pool(struct device *dev, gfp_t gfp)
> {
>   if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
>   return false;
>   if (gfpflags_allow_blocking(gfp))
>   return false;
>   if (force_dma_unencrypted(dev))
>   return true;
>   if (dma_alloc_need_uncached(dev))
>   return true;
> }

Looks good, fixed.  I renamed it to dma_should_alloc_from_pool() to avoid 
confusing it with the actual allocation function and added a 
dma_should_free_from_pool() as well.

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -75,6 +75,39 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t 
phys, size_t size)
		min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
 }
 
+/*
+ * Decrypting memory is allowed to block, so if this device requires
+ * unencrypted memory it must come from atomic pools.
+ */
+static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp,
+ unsigned long attrs)
+{
+   if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
+   return false;
+   if (gfpflags_allow_blocking(gfp))
+   return false;
+   if (force_dma_unencrypted(dev))
+   return true;
+   if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP))
+   return false;
+   if (dma_alloc_need_uncached(dev, attrs))
+   return true;
+   return false;
+}
+
+static inline bool dma_should_free_from_pool(struct device *dev,
+unsigned long attrs)
+{
+   if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
+   return true;
+   if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
+   !force_dma_unencrypted(dev))
+   return false;
+   if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP))
+   return true;
+   return false;
+}
+
 struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp_t gfp, unsigned long attrs)
 {
@@ -124,9 +157,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t 
size,
struct page *page;
void *ret;
 
-   if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-   dma_alloc_need_uncached(dev, attrs) &&
-   !gfpflags_allow_blocking(gfp)) {
+   if (dma_should_alloc_from_pool(dev, gfp, attrs)) {
	ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp);
if (!ret)
return NULL;
@@ -202,6 +233,11 @@ void dma_direct_free_pages(struct device *dev, size_t 
size, void *cpu_addr,
 {
unsigned int page_order = get_order(size);
 
+   /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
+   if (dma_should_free_from_pool(dev, attrs) &&
+   dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
+   return;
+
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
!force_dma_unencrypted(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
@@ -209,10 +245,6 @@ void dma_direct_free_pages(struct device *dev, size_t 
size, void *cpu_addr,
return;
}
 
-   if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-   dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
-   return;
-
if (force_dma_unencrypted(dev))
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
 


Re: [rfc v2 4/6] dma-direct: atomic allocations must come from atomic coherent pools

2020-04-14 Thread David Rientjes via iommu
On Thu, 9 Apr 2020, Tom Lendacky wrote:

> > When a device required unencrypted memory and the context does not allow
> 
> required => requires
> 

Fixed, thanks.

> > blocking, memory must be returned from the atomic coherent pools.
> > 
> > This avoids the remap when CONFIG_DMA_DIRECT_REMAP is not enabled and the
> > config only requires CONFIG_DMA_COHERENT_POOL.  This will be used for
> > CONFIG_AMD_MEM_ENCRYPT in a subsequent patch.
> > 
> > Keep all memory in these pools unencrypted.
> > 
> > Signed-off-by: David Rientjes 
> > ---
> >   kernel/dma/direct.c | 16 
> >   kernel/dma/pool.c   | 15 +--
> >   2 files changed, 29 insertions(+), 2 deletions(-)
> > 
> > diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> > index 70800ca64f13..44165263c185 100644
> > --- a/kernel/dma/direct.c
> > +++ b/kernel/dma/direct.c
> > @@ -124,6 +124,18 @@ void *dma_direct_alloc_pages(struct device *dev, size_t
> > size,
> > struct page *page;
> > void *ret;
> >   + /*
> > +* Unencrypted memory must come directly from DMA atomic pools if
> > +* blocking is not allowed.
> > +*/
> > +   if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
> > +   force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp)) {
> > +   ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp);
> > +   if (!ret)
> > +   return NULL;
> > +   goto done;
> > +   }
> > +
> > if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
> > dma_alloc_need_uncached(dev, attrs) &&
> > !gfpflags_allow_blocking(gfp)) {
> > @@ -203,6 +215,10 @@ void dma_direct_free_pages(struct device *dev, size_t
> > size, void *cpu_addr,
> >   {
> > unsigned int page_order = get_order(size);
> >   + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
> > +   dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
> > +   return;
> > +
> > if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
> > !force_dma_unencrypted(dev)) {
> > /* cpu_addr is a struct page cookie, not a kernel address */
> > diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> > index e14c5a2da734..6685ab89cfa7 100644
> > --- a/kernel/dma/pool.c
> > +++ b/kernel/dma/pool.c
> > @@ -9,6 +9,7 @@
> >   #include 
> >   #include 
> >   #include 
> > +#include 
> >   #include 
> >   #include 
> >   #include 
> > @@ -55,12 +56,20 @@ static int atomic_pool_expand(struct gen_pool *pool,
> > size_t pool_size,
> > arch_dma_prep_coherent(page, pool_size);
> >   +#ifdef CONFIG_DMA_DIRECT_REMAP
> > addr = dma_common_contiguous_remap(page, pool_size,
> >pgprot_dmacoherent(PAGE_KERNEL),
> >__builtin_return_address(0));
> > if (!addr)
> > goto free_page;
> > -
> > +#else
> > +   addr = page_to_virt(page);
> > +#endif
> > +   /*
> > +* Memory in the atomic DMA pools must be unencrypted, the pools do
> > not
> > +* shrink so no re-encryption occurs in dma_direct_free_pages().
> > +*/
> > +   set_memory_decrypted((unsigned long)page_to_virt(page), 1 << order);
> > ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page),
> > pool_size, NUMA_NO_NODE);
> > if (ret)
> > @@ -69,8 +78,10 @@ static int atomic_pool_expand(struct gen_pool *pool,
> > size_t pool_size,
> > return 0;
> > remove_mapping:
> > +#ifdef CONFIG_DMA_DIRECT_REMAP
> > dma_common_free_remap(addr, pool_size);
> 
> You're about to free the memory, but you've called set_memory_decrypted()
> against it, so you need to do a set_memory_encrypted() to bring it back to a
> state ready for allocation again.
> 

Ah, good catch, thanks.  I notice that I should also be checking the 
return value of set_memory_decrypted() because pages added to the coherent 
pools *must* be unencrypted.  If it fails, we fail the expansion.

And do the same thing for set_memory_encrypted(), which would be a bizarre 
situation (decrypt succeeded, encrypt failed), by simply leaking the page.

diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -53,22 +54,42 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t 
pool_size,
 
arch_dma_prep_coherent(page, pool_size);
 
+#ifdef CONFIG_DMA_DIRECT_REMAP
addr = dma_common_contiguous_remap(page, pool_size,
   pgprot_dmacoherent(PAGE_KERNEL),
   __builtin_return_address(0));
if (!addr)
goto free_page;
-
+#else
+   addr = page_to_virt(page);
+#endif
+   /*
+* Memory in the atomic DMA pools must be unencrypted, the pools do not
+* shrink so no re-encryption occurs in dma_direct_free_pages().
+*/
+   ret = set_memory_decrypted((unsigned long)page_to_virt(page),
+  

Re: [PATCH v5 01/25] mm/mmu_notifiers: pass private data down to alloc_notifier()

2020-04-14 Thread Jason Gunthorpe
On Tue, Apr 14, 2020 at 07:02:29PM +0200, Jean-Philippe Brucker wrote:
> The new allocation scheme introduced by commit 2c7933f53f6b
> ("mm/mmu_notifiers: add a get/put scheme for the registration") provides
> a convenient way for users to attach notifier data to an mm. However, it
> would be even better to create this notifier data atomically.
> 
> Since the alloc_notifier() callback only takes an mm argument at the
> moment, some users have to perform the allocation in two times.
> alloc_notifier() initially creates an incomplete structure, which is
> then finalized using more context once mmu_notifier_get() returns. This
> second step requires extra care to order memory accesses against live
> invalidation.
> 
> The IOMMU SVA module, which attaches an mm to multiple devices,
> exemplifies this situation. In essence it does:
> 
>   mmu_notifier_get()
> alloc_notifier()
>A = kzalloc()
> /* MMU notifier is published */
>   A->ctx = ctx;   // (1)
>   device->A = A;
>   list_add_rcu(device, A->devices);   // (2)
> 
> The invalidate notifier, which may start running before A is fully
> initialized, does the following:
> 
>   io_mm_invalidate(A)
> list_for_each_entry_rcu(device, A->devices)
>   device->invalidate(A->ctx)

This could probably also have been reliably fixed by not having A->ctx
be separately allocated memory, but inlining it into the notifier struct.

But I can't think of a downside to adding a params argument either.

Reviewed-by: Jason Gunthorpe 

Regards,
Jason


Re: [PATCH v5 23/25] PCI/ATS: Add PRI stubs

2020-04-14 Thread Kuppuswamy, Sathyanarayanan

Hi,

On 4/14/20 10:02 AM, Jean-Philippe Brucker wrote:

The SMMUv3 driver, which can be built without CONFIG_PCI, will soon gain
support for PRI.  Partially revert commit c6e9aefbf9db ("PCI/ATS: Remove
unused PRI and PASID stubs") to re-introduce the PRI stubs, and avoid
adding more #ifdefs to the SMMU driver.

Acked-by: Bjorn Helgaas 
Signed-off-by: Jean-Philippe Brucker 
Reviewed-by: Kuppuswamy Sathyanarayanan 


---
  include/linux/pci-ats.h | 8 
  1 file changed, 8 insertions(+)

diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h
index f75c307f346de..e9e266df9b37c 100644
--- a/include/linux/pci-ats.h
+++ b/include/linux/pci-ats.h
@@ -28,6 +28,14 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs);
  void pci_disable_pri(struct pci_dev *pdev);
  int pci_reset_pri(struct pci_dev *pdev);
  int pci_prg_resp_pasid_required(struct pci_dev *pdev);
+#else /* CONFIG_PCI_PRI */
+static inline int pci_enable_pri(struct pci_dev *pdev, u32 reqs)
+{ return -ENODEV; }
+static inline void pci_disable_pri(struct pci_dev *pdev) { }
+static inline int pci_reset_pri(struct pci_dev *pdev)
+{ return -ENODEV; }
+static inline int pci_prg_resp_pasid_required(struct pci_dev *pdev)
+{ return 0; }
  #endif /* CONFIG_PCI_PRI */
  
  #ifdef CONFIG_PCI_PASID





Re: [PATCH v5 24/25] PCI/ATS: Export PRI functions

2020-04-14 Thread Kuppuswamy, Sathyanarayanan



Hi,
On 4/14/20 10:02 AM, Jean-Philippe Brucker wrote:

The SMMUv3 driver uses pci_{enable,disable}_pri() and related
functions. Export those functions to allow the driver to be built as a
module.

Acked-by: Bjorn Helgaas 
Signed-off-by: Jean-Philippe Brucker 
Reviewed-by: Kuppuswamy Sathyanarayanan 


---
  drivers/pci/ats.c | 4 
  1 file changed, 4 insertions(+)

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index bbfd0d42b8b97..fc8fc6fc8bd55 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -197,6 +197,7 @@ void pci_pri_init(struct pci_dev *pdev)
if (status & PCI_PRI_STATUS_PASID)
pdev->pasid_required = 1;
  }
+EXPORT_SYMBOL_GPL(pci_pri_init);
  
  /**

   * pci_enable_pri - Enable PRI capability
@@ -243,6 +244,7 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs)
  
  	return 0;

  }
+EXPORT_SYMBOL_GPL(pci_enable_pri);
  
  /**

   * pci_disable_pri - Disable PRI capability
@@ -322,6 +324,7 @@ int pci_reset_pri(struct pci_dev *pdev)
  
  	return 0;

  }
+EXPORT_SYMBOL_GPL(pci_reset_pri);
  
  /**

   * pci_prg_resp_pasid_required - Return PRG Response PASID Required bit
@@ -337,6 +340,7 @@ int pci_prg_resp_pasid_required(struct pci_dev *pdev)
  
  	return pdev->pasid_required;

  }
+EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required);
  #endif /* CONFIG_PCI_PRI */
  
  #ifdef CONFIG_PCI_PASID





Re: [PATCH] dt-bndings: iommu: renesas, ipmmu-vmsa: convert to json-schema

2020-04-14 Thread Robin Murphy

On 2020-04-13 11:25 am, Yoshihiro Shimoda wrote:
[...]

-Each bus master connected to an IPMMU must reference the IPMMU in its device
-node with the following property:
-
-  - iommus: A reference to the IPMMU in two cells. The first cell is a phandle
-to the IPMMU and the second cell the number of the micro-TLB that the
-device is connected to.


This definition of what the phandle argument means...

[...]

+  '#iommu-cells':
+const: 1

> +

...deserves to be captured in a description here.

Robin.


Re: [PATCH v2 1/3] iommu/uapi: Define uapi version and capabilities

2020-04-14 Thread Jacob Pan
On Tue, 14 Apr 2020 10:13:58 -0600
Alex Williamson  wrote:

> On Mon, 13 Apr 2020 22:05:15 -0700
> Jacob Pan  wrote:
> 
> > Hi Alex,
> > Thanks a lot for the feedback, my comments inline.
> > 
> > On Mon, 13 Apr 2020 16:21:29 -0600
> > Alex Williamson  wrote:
> >   
> > > On Mon, 13 Apr 2020 13:41:57 -0700
> > > Jacob Pan  wrote:
> > > 
> > > > Hi All,
> > > > 
> > > > Just a gentle reminder, any feedback on the options I listed
> > > > below? New ideas will be even better.
> > > > 
> > > > Christoph, does the explanation make sense to you? We do have
> > > > the capability/flag based scheme for IOMMU API extension, the
> > > > version is mainly used for size lookup. Compatibility checking
> > > > is another use of the version, it makes checking easy when a
> > > > vIOMMU is launched.
> > > > 
> > > > Thanks,
> > > > 
> > > > Jacob
> > > > 
> > > > On Thu, 2 Apr 2020 11:36:04 -0700
> > > > Jacob Pan  wrote:
> > > >   
> > > > > On Wed, 1 Apr 2020 05:32:21 +
> > > > > "Tian, Kevin"  wrote:
> > > > > 
> > > > > > > From: Jacob Pan 
> > > > > > > Sent: Tuesday, March 31, 2020 11:55 PM
> > > > > > > 
> > > > > > > On Tue, 31 Mar 2020 06:06:38 +
> > > > > > > "Tian, Kevin"  wrote:
> > > > > > > 
> > > > > > > > > From: Jacob Pan 
> > > > > > > > > Sent: Tuesday, March 31, 2020 12:08 AM
> > > > > > > > >
> > > > > > > > > On Mon, 30 Mar 2020 05:40:40 +
> > > > > > > > > "Tian, Kevin"  wrote:
> > > > > > > > >
> > > > > > > > > > > From: Jacob Pan 
> > > > > > > > > > > Sent: Saturday, March 28, 2020 7:54 AM
> > > > > > > > > > >
> > > > > > > > > > > On Fri, 27 Mar 2020 00:47:02 -0700
> > > > > > > > > > > Christoph Hellwig  wrote:
> > > > > > > > > > >
> > > > > > > > > > > > On Fri, Mar 27, 2020 at 02:49:55AM +, Tian,
> > > > > > > > > > > > Kevin wrote:
> > > > > > > > > > > > > If those API calls are inter-dependent for
> > > > > > > > > > > > > composing a feature (e.g. SVA), shouldn't we
> > > > > > > > > > > > > need a way to check them together before
> > > > > > > > > > > > > exposing the feature to the guest, e.g.
> > > > > > > > > > > > > through a iommu_get_uapi_capabilities
> > > > > > > > > > > > > interface?
> > > > > > > > > > > >
> > > > > > > > > > > > Yes, that makes sense.  The important bit is to
> > > > > > > > > > > > have a capability flags and not version
> > > > > > > > > > > > numbers.
> > > > > > > > > > >
> > > > > > > > > > > The challenge is that there are two consumers in
> > > > > > > > > > > the kernel for this. 1. VFIO only look for
> > > > > > > > > > > compatibility, and size of each data struct such
> > > > > > > > > > > that it can copy_from_user.
> > > > > > > > > > >
> > > > > > > > > > > 2. IOMMU driver, the "real consumer" of the
> > > > > > > > > > > content.
> > > > > > > > > > >
> > > > > > > > > > > For 2, I agree and we do plan to use the
> > > > > > > > > > > capability flags to check content and maintain
> > > > > > > > > > > backward compatibility etc.
> > > > > > > > > > >
> > > > > > > > > > > For VFIO, it is difficult to do size look up
> > > > > > > > > > > based on capability flags.
> > > > > > > > > >
> > > > > > > > > > Can you elaborate the difficulty in VFIO? if, as
> > > > > > > > > > Christoph Hellwig pointed out, version number is
> > > > > > > > > > already avoided everywhere, it is interesting to
> > > > > > > > > > know whether this work becomes a real exception or
> > > > > > > > > > just requires a different mindset. 
> > > > > > > > > From VFIO p.o.v. the IOMMU UAPI data is opaque, it
> > > > > > > > > only needs to do two things:
> > > > > > > > > 1. is the UAPI compatible?
> > > > > > > > > 2. what is the size to copy?
> > > > > > > > >
> > > > > > > > > If you look at the version number, this is really a
> > > > > > > > > "version as size" lookup, as provided by the helper
> > > > > > > > > function in this patch. An example can be the newly
> > > > > > > > > introduced clone3 syscall.
> > > > > > > > > https://lwn.net/Articles/792628/ In clone3, new
> > > > > > > > > version must have new size. The slight difference
> > > > > > > > > here is that, unlike clone3, we have multiple data
> > > > > > > > > structures instead of a single struct clone_args {}.
> > > > > > > > > And each struct has flags to enumerate its contents
> > > > > > > > > besides size.
> > > > > > > >
> > > > > > > > Thanks for providing that link. However clone3 doesn't
> > > > > > > > include a version field to do "version as size" lookup.
> > > > > > > > Instead, as you said, it includes a size parameter which
> > > > > > > > sounds like the option 3 (argsz) listed below.
> > > > > > > >
> > > > > > > Right, there is no version in clone3. size = version. I
> > > > > > > view this as a 1:1 lookup.
> > > > > > > 
> > > > > > > > >
> > > > > > > > > Besides breaching data abstraction, if VFIO has to
> > > > > > > > 

[PATCH v5 14/25] iommu/arm-smmu-v3: Add support for VHE

2020-04-14 Thread Jean-Philippe Brucker
ARMv8.1 extensions added Virtualization Host Extensions (VHE), which allow
running a host kernel at EL2. When using normal DMA, Device and CPU address
spaces are dissociated, and do not need to implement the same
capabilities, so VHE hasn't been used in the SMMU until now.

With shared address spaces however, ASIDs are shared between MMU and SMMU,
and broadcast TLB invalidations issued by a CPU are taken into account by
the SMMU. TLB entries on both sides need to have identical exception level
in order to be cleared with a single invalidation.

When the CPU is using VHE, enable VHE in the SMMU for all STEs. Normal DMA
mappings will need to use TLBI_EL2 commands instead of TLBI_NH, but
shouldn't be otherwise affected by this change.

Signed-off-by: Jean-Philippe Brucker 
---
v4->v5: bump feature bit
---
 drivers/iommu/arm-smmu-v3.c | 31 ++-
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 8fbc5da133ae4..21d458d817fc2 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -480,6 +481,8 @@ struct arm_smmu_cmdq_ent {
#define CMDQ_OP_TLBI_NH_ASID0x11
#define CMDQ_OP_TLBI_NH_VA  0x12
#define CMDQ_OP_TLBI_EL2_ALL0x20
+   #define CMDQ_OP_TLBI_EL2_ASID   0x21
+   #define CMDQ_OP_TLBI_EL2_VA 0x22
#define CMDQ_OP_TLBI_S12_VMALL  0x28
#define CMDQ_OP_TLBI_S2_IPA 0x2a
#define CMDQ_OP_TLBI_NSNH_ALL   0x30
@@ -651,6 +654,7 @@ struct arm_smmu_device {
 #define ARM_SMMU_FEAT_STALL_FORCE  (1 << 13)
 #define ARM_SMMU_FEAT_VAX  (1 << 14)
 #define ARM_SMMU_FEAT_RANGE_INV(1 << 15)
+#define ARM_SMMU_FEAT_E2H  (1 << 16)
u32 features;
 
 #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
@@ -924,6 +928,8 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
arm_smmu_cmdq_ent *ent)
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
+   /* Fallthrough */
+   case CMDQ_OP_TLBI_EL2_VA:
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
@@ -945,6 +951,9 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
arm_smmu_cmdq_ent *ent)
case CMDQ_OP_TLBI_S12_VMALL:
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
break;
+   case CMDQ_OP_TLBI_EL2_ASID:
+   cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
+   break;
case CMDQ_OP_ATC_INV:
cmd[0] |= FIELD_PREP(CMDQ_0_SSV, ent->substream_valid);
cmd[0] |= FIELD_PREP(CMDQ_ATC_0_GLOBAL, ent->atc.global);
@@ -1538,7 +1547,8 @@ static int arm_smmu_cmdq_batch_submit(struct 
arm_smmu_device *smmu,
 static void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid)
 {
struct arm_smmu_cmdq_ent cmd = {
-   .opcode = CMDQ_OP_TLBI_NH_ASID,
+   .opcode = smmu->features & ARM_SMMU_FEAT_E2H ?
+   CMDQ_OP_TLBI_EL2_ASID : CMDQ_OP_TLBI_NH_ASID,
.tlbi.asid = asid,
};
 
@@ -2093,13 +2103,16 @@ static void arm_smmu_write_strtab_ent(struct 
arm_smmu_master *master, u32 sid,
}
 
if (s1_cfg) {
+   int strw = smmu->features & ARM_SMMU_FEAT_E2H ?
+   STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1;
+
BUG_ON(ste_live);
dst[1] = cpu_to_le64(
 FIELD_PREP(STRTAB_STE_1_S1DSS, 
STRTAB_STE_1_S1DSS_SSID0) |
 FIELD_PREP(STRTAB_STE_1_S1CIR, 
STRTAB_STE_1_S1C_CACHE_WBRA) |
 FIELD_PREP(STRTAB_STE_1_S1COR, 
STRTAB_STE_1_S1C_CACHE_WBRA) |
 FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) |
-FIELD_PREP(STRTAB_STE_1_STRW, 
STRTAB_STE_1_STRW_NSEL1));
+FIELD_PREP(STRTAB_STE_1_STRW, strw));
 
if (smmu->features & ARM_SMMU_FEAT_STALLS &&
   !(smmu->features & ARM_SMMU_FEAT_STALL_FORCE))
@@ -2495,7 +2508,8 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, 
size_t size,
return;
 
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
-   cmd.opcode  = CMDQ_OP_TLBI_NH_VA;
+   cmd.opcode  = smmu->features & ARM_SMMU_FEAT_E2H ?
+ CMDQ_OP_TLBI_EL2_VA : CMDQ_OP_TLBI_NH_VA;
cmd.tlbi.asid   = smmu_domain->s1_cfg.cd.asid;
 

[PATCH v5 24/25] PCI/ATS: Export PRI functions

2020-04-14 Thread Jean-Philippe Brucker
The SMMUv3 driver uses pci_{enable,disable}_pri() and related
functions. Export those functions to allow the driver to be built as a
module.

Acked-by: Bjorn Helgaas 
Signed-off-by: Jean-Philippe Brucker 
---
 drivers/pci/ats.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index bbfd0d42b8b97..fc8fc6fc8bd55 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -197,6 +197,7 @@ void pci_pri_init(struct pci_dev *pdev)
if (status & PCI_PRI_STATUS_PASID)
pdev->pasid_required = 1;
 }
+EXPORT_SYMBOL_GPL(pci_pri_init);
 
 /**
  * pci_enable_pri - Enable PRI capability
@@ -243,6 +244,7 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs)
 
return 0;
 }
+EXPORT_SYMBOL_GPL(pci_enable_pri);
 
 /**
  * pci_disable_pri - Disable PRI capability
@@ -322,6 +324,7 @@ int pci_reset_pri(struct pci_dev *pdev)
 
return 0;
 }
+EXPORT_SYMBOL_GPL(pci_reset_pri);
 
 /**
  * pci_prg_resp_pasid_required - Return PRG Response PASID Required bit
@@ -337,6 +340,7 @@ int pci_prg_resp_pasid_required(struct pci_dev *pdev)
 
return pdev->pasid_required;
 }
+EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required);
 #endif /* CONFIG_PCI_PRI */
 
 #ifdef CONFIG_PCI_PASID
-- 
2.26.0



[PATCH v5 03/25] iommu: Add a page fault handler

2020-04-14 Thread Jean-Philippe Brucker
Some systems allow devices to handle I/O Page Faults in the core mm. For
example systems implementing the PCI PRI extension or Arm SMMU stall
model. Infrastructure for reporting these recoverable page faults was
recently added to the IOMMU core. Add a page fault handler for host SVA.

IOMMU driver can now instantiate several fault workqueues and link them to
IOPF-capable devices. Drivers can choose between a single global
workqueue, one per IOMMU device, one per low-level fault queue, one per
domain, etc.

When it receives a fault event, supposedly in an IRQ handler, the IOMMU
driver reports the fault using iommu_report_device_fault(), which calls
the registered handler. The page fault handler then calls the mm fault
handler, and reports either success or failure with iommu_page_response().
When the handler succeeded, the IOMMU retries the access.
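
For a rough idea of the intended driver-side wiring (struct example_smmu,
example_iopf_flush and the error handling policy are placeholders, not part
of this patch):

static int example_enable_iopf(struct example_smmu *smmu,
			       struct device *master_dev)
{
	int ret;

	/* One fault workqueue per IOMMU device: one of the options above. */
	if (!smmu->iopf_queue)
		smmu->iopf_queue = iopf_queue_alloc(dev_name(smmu->dev),
						    example_iopf_flush, smmu);
	if (!smmu->iopf_queue)
		return -ENOMEM;

	/* Make the master's faults eligible for handling by the queue... */
	ret = iopf_queue_add_device(smmu->iopf_queue, master_dev);
	if (ret)
		return ret;

	/* ...and feed reported faults into it. */
	return iommu_register_device_fault_handler(master_dev, iommu_queue_iopf,
						   master_dev);
}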

The iopf_param pointer could be embedded into iommu_fault_param. But
putting iopf_param into the iommu_param structure allows us not to care
about ordering between calls to iopf_queue_add_device() and
iommu_register_device_fault_handler().

Signed-off-by: Jean-Philippe Brucker 
---
v4->v5: Fix 'busy' refcount
---
 drivers/iommu/Kconfig  |   4 +
 drivers/iommu/Makefile |   1 +
 include/linux/iommu.h  |  60 +
 drivers/iommu/io-pgfault.c | 452 +
 4 files changed, 517 insertions(+)
 create mode 100644 drivers/iommu/io-pgfault.c

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index e81842f59b037..bf620bf48da03 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -109,6 +109,10 @@ config IOMMU_SVA
select IOMMU_API
select MMU_NOTIFIER
 
+config IOMMU_PAGE_FAULT
+   bool
+   select IOMMU_API
+
 config FSL_PAMU
bool "Freescale IOMMU support"
depends on PCI
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 40c800dd4e3ef..bf5cb4ee84093 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_IOMMU_API) += iommu-traces.o
 obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
 obj-$(CONFIG_IOMMU_DEBUGFS) += iommu-debugfs.o
 obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
+obj-$(CONFIG_IOMMU_PAGE_FAULT) += io-pgfault.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 167e468dd3510..5a3d092c2568a 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -343,12 +343,21 @@ struct iommu_fault_param {
struct mutex lock;
 };
 
+/**
+ * iopf_queue_flush_t - Flush low-level page fault queue
+ *
+ * Report all faults currently pending in the low-level page fault queue
+ */
+struct iopf_queue;
+typedef int (*iopf_queue_flush_t)(void *cookie, struct device *dev, int pasid);
+
 /**
  * struct dev_iommu - Collection of per-device IOMMU data
  *
  * @fault_param: IOMMU detected device fault reporting data
  * @sva_param:  IOMMU parameter for SVA
  * @sva_lock:   protects @sva_param
+ * @iopf_param: I/O Page Fault queue and data
  * @fwspec: IOMMU fwspec data
  * @priv:   IOMMU Driver private data
  *
@@ -360,6 +369,7 @@ struct dev_iommu {
struct iommu_fault_param*fault_param;
struct iommu_sva_param  *sva_param;
struct mutexsva_lock;
+   struct iopf_device_param*iopf_param;
struct iommu_fwspec *fwspec;
void*priv;
 };
@@ -1071,4 +1081,54 @@ void iommu_debugfs_setup(void);
 static inline void iommu_debugfs_setup(void) {}
 #endif
 
+#ifdef CONFIG_IOMMU_PAGE_FAULT
+extern int iommu_queue_iopf(struct iommu_fault *fault, void *cookie);
+
+extern int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev);
+extern int iopf_queue_remove_device(struct iopf_queue *queue,
+   struct device *dev);
+extern int iopf_queue_flush_dev(struct device *dev, int pasid);
+extern struct iopf_queue *
+iopf_queue_alloc(const char *name, iopf_queue_flush_t flush, void *cookie);
+extern void iopf_queue_free(struct iopf_queue *queue);
+extern int iopf_queue_discard_partial(struct iopf_queue *queue);
+#else /* CONFIG_IOMMU_PAGE_FAULT */
+static inline int iommu_queue_iopf(struct iommu_fault *fault, void *cookie)
+{
+   return -ENODEV;
+}
+
+static inline int iopf_queue_add_device(struct iopf_queue *queue,
+   struct device *dev)
+{
+   return -ENODEV;
+}
+
+static inline int iopf_queue_remove_device(struct iopf_queue *queue,
+  struct device *dev)
+{
+   return -ENODEV;
+}
+
+static inline int iopf_queue_flush_dev(struct device *dev, int pasid)
+{
+   return -ENODEV;
+}
+
+static inline struct iopf_queue *
+iopf_queue_alloc(const char *name, iopf_queue_flush_t flush, void *cookie)
+{
+   return 

[PATCH v5 21/25] dt-bindings: document stall property for IOMMU masters

2020-04-14 Thread Jean-Philippe Brucker
On ARM systems, some platform devices behind an IOMMU may support stall,
which is the ability to recover from page faults. Let the firmware tell us
when a device supports stall.

Reviewed-by: Rob Herring 
Signed-off-by: Jean-Philippe Brucker 
---
 .../devicetree/bindings/iommu/iommu.txt| 18 ++
 1 file changed, 18 insertions(+)

diff --git a/Documentation/devicetree/bindings/iommu/iommu.txt 
b/Documentation/devicetree/bindings/iommu/iommu.txt
index 3c36334e4f942..26ba9e530f138 100644
--- a/Documentation/devicetree/bindings/iommu/iommu.txt
+++ b/Documentation/devicetree/bindings/iommu/iommu.txt
@@ -92,6 +92,24 @@ Optional properties:
   tagging DMA transactions with an address space identifier. By default,
   this is 0, which means that the device only has one address space.
 
+- dma-can-stall: When present, the master can wait for a transaction to
+  complete for an indefinite amount of time. Upon translation fault, some
+  IOMMUs, instead of aborting the translation immediately, may first
+  notify the driver and keep the transaction in flight. This allows the OS
+  to inspect the fault and, for example, make physical pages resident
+  before updating the mappings and completing the transaction. Such an IOMMU
+  accepts a limited number of simultaneous stalled transactions before
+  having to either put back-pressure on the master, or abort new faulting
+  transactions.
+
+  Firmware has to opt in to stalling, because most buses and masters don't
+  support it. In particular it isn't compatible with PCI, where
+  transactions have to complete before a time limit. More generally it
+  won't work in systems and masters that haven't been designed for
+  stalling. For example the OS, in order to handle a stalled transaction,
+  may attempt to retrieve pages from secondary storage in a stalled
+  domain, leading to a deadlock.
+
 
 Notes:
 ==
-- 
2.26.0



[PATCH v5 18/25] iommu/arm-smmu-v3: Hook up ATC invalidation to mm ops

2020-04-14 Thread Jean-Philippe Brucker
iommu-sva calls us when an mm is modified. Perform the required ATC
invalidations.

Signed-off-by: Jean-Philippe Brucker 
---
v4->v5: more comments
---
 drivers/iommu/arm-smmu-v3.c | 70 ++---
 1 file changed, 58 insertions(+), 12 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 6640c2ac2a7c5..c4bffb14461aa 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -2375,6 +2375,20 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, 
size_t size,
size_t inval_grain_shift = 12;
unsigned long page_start, page_end;
 
+   /*
+* ATS and PASID:
+*
+* If substream_valid is clear, the PCIe TLP is sent without a PASID
+* prefix. In that case all ATC entries within the address range are
+* invalidated, including those that were requested with a PASID! There
+* is no way to invalidate only entries without PASID.
+*
+* When using STRTAB_STE_1_S1DSS_SSID0 (reserving CD 0 for non-PASID
+* traffic), translation requests without PASID create ATC entries
+* without PASID, which must be invalidated with substream_valid clear.
+* This has the unpleasant side-effect of invalidating all PASID-tagged
+* ATC entries within the address range.
+*/
*cmd = (struct arm_smmu_cmdq_ent) {
.opcode = CMDQ_OP_ATC_INV,
.substream_valid= !!ssid,
@@ -2418,12 +2432,12 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, 
size_t size,
cmd->atc.size   = log2_span;
 }
 
-static int arm_smmu_atc_inv_master(struct arm_smmu_master *master)
+static int arm_smmu_atc_inv_master(struct arm_smmu_master *master, int ssid)
 {
int i;
struct arm_smmu_cmdq_ent cmd;
 
-   arm_smmu_atc_inv_to_cmd(0, 0, 0, );
+   arm_smmu_atc_inv_to_cmd(ssid, 0, 0, );
 
for (i = 0; i < master->num_sids; i++) {
cmd.atc.sid = master->sids[i];
@@ -2934,7 +2948,7 @@ static void arm_smmu_disable_ats(struct arm_smmu_master 
*master)
 * ATC invalidation via the SMMU.
 */
wmb();
-   arm_smmu_atc_inv_master(master);
+   arm_smmu_atc_inv_master(master, 0);
atomic_dec(_domain->nr_ats_masters);
 }
 
@@ -3131,7 +3145,22 @@ arm_smmu_iova_to_phys(struct iommu_domain *domain, 
dma_addr_t iova)
 static void arm_smmu_mm_invalidate(struct device *dev, int pasid, void *entry,
   unsigned long iova, size_t size)
 {
-   /* TODO: Invalidate ATC */
+   int i;
+   struct arm_smmu_cmdq_ent cmd;
+   struct arm_smmu_cmdq_batch cmds = {};
+   struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+
+   if (!master->ats_enabled)
+   return;
+
+   arm_smmu_atc_inv_to_cmd(pasid, iova, size, );
+
+   for (i = 0; i < master->num_sids; i++) {
+   cmd.atc.sid = master->sids[i];
+   arm_smmu_cmdq_batch_add(master->smmu, , );
+   }
+
+   arm_smmu_cmdq_batch_submit(master->smmu, );
 }
 
 static int arm_smmu_mm_attach(struct device *dev, int pasid, void *entry,
@@ -3168,26 +3197,43 @@ static void arm_smmu_mm_clear(struct device *dev, int 
pasid, void *entry)
 * for this ASID, so we need to do it manually.
 */
arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid);
-
-   /* TODO: invalidate ATC */
+   arm_smmu_atc_inv_domain(smmu_domain, pasid, 0, 0);
 }
 
 static void arm_smmu_mm_detach(struct device *dev, int pasid, void *entry,
   bool detach_domain, bool cleared)
 {
struct arm_smmu_ctx_desc *cd = entry;
+   struct arm_smmu_master *master = dev_iommu_priv_get(dev);
struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 
-   if (detach_domain) {
+   if (detach_domain)
arm_smmu_write_ctx_desc(smmu_domain, pasid, NULL);
 
-   if (!cleared)
-   /* See comment in arm_smmu_mm_clear() */
-   arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid);
-   }
+   /*
+* If we went through clear(), we've already invalidated, and no new TLB
+* entry can have been formed.
+*/
+   if (cleared)
+   return;
+
+   if (detach_domain) {
+   /* See comment in arm_smmu_mm_clear() */
+   arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid);
+   arm_smmu_atc_inv_domain(smmu_domain, pasid, 0, 0);
 
-   /* TODO: invalidate ATC */
+   } else if (master->ats_enabled) {
+   /*
+* There are more devices bound with this PASID in this domain,
+* so we cannot yet clear the PASID entry, and this device could
+* create new ATC entries. Invalidate the ATC for the sake of
+* it. On 

[PATCH v5 12/25] iommu/arm-smmu-v3: Share process page tables

2020-04-14 Thread Jean-Philippe Brucker
With Shared Virtual Addressing (SVA), we need to mirror CPU TTBR, TCR,
MAIR and ASIDs in SMMU contexts. Each SMMU has a single ASID space split
into two sets, shared and private. Shared ASIDs correspond to those
obtained from the arch ASID allocator, and private ASIDs are used for
"classic" map/unmap DMA.

Cc: Suzuki K Poulose 
Signed-off-by: Jean-Philippe Brucker 
---
 drivers/iommu/arm-smmu-v3.c | 161 +++-
 1 file changed, 157 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 96ee60002e85e..09f4f712fb103 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -33,6 +34,8 @@
 
 #include 
 
+#include "io-pgtable-arm.h"
+
 /* MMIO registers */
 #define ARM_SMMU_IDR0  0x0
 #define IDR0_ST_LVLGENMASK(28, 27)
@@ -587,6 +590,9 @@ struct arm_smmu_ctx_desc {
u64 ttbr;
u64 tcr;
u64 mair;
+
+   refcount_t  refs;
+   struct mm_struct*mm;
 };
 
 struct arm_smmu_l1_ctx_desc {
@@ -1660,7 +1666,8 @@ static int arm_smmu_write_ctx_desc(struct arm_smmu_domain 
*smmu_domain,
 #ifdef __BIG_ENDIAN
CTXDESC_CD_0_ENDI |
 #endif
-   CTXDESC_CD_0_R | CTXDESC_CD_0_A | CTXDESC_CD_0_ASET |
+   CTXDESC_CD_0_R | CTXDESC_CD_0_A |
+   (cd->mm ? 0 : CTXDESC_CD_0_ASET) |
CTXDESC_CD_0_AA64 |
FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) |
CTXDESC_CD_0_V;
@@ -1764,12 +1771,156 @@ static void arm_smmu_free_cd_tables(struct 
arm_smmu_domain *smmu_domain)
cdcfg->cdtab = NULL;
 }
 
-static void arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd)
+static void arm_smmu_init_cd(struct arm_smmu_ctx_desc *cd)
 {
+   refcount_set(>refs, 1);
+}
+
+static bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd)
+{
+   bool free;
+   struct arm_smmu_ctx_desc *old_cd;
+
if (!cd->asid)
-   return;
+   return false;
+
+   xa_lock(_xa);
+   free = refcount_dec_and_test(>refs);
+   if (free) {
+   old_cd = __xa_erase(_xa, cd->asid);
+   WARN_ON(old_cd != cd);
+   }
+   xa_unlock(_xa);
+   return free;
+}
+
+static struct arm_smmu_ctx_desc *arm_smmu_share_asid(u16 asid)
+{
+   struct arm_smmu_ctx_desc *cd;
+
+   cd = xa_load(_xa, asid);
+   if (!cd)
+   return NULL;
+
+   if (cd->mm) {
+   /*
+* It's pretty common to find a stale CD when doing unbind-bind,
+* given that the release happens after a RCU grace period.
+* arm_smmu_free_asid() hasn't gone through yet, so reuse it.
+*/
+   refcount_inc(>refs);
+   return cd;
+   }
+
+   /*
+* Ouch, ASID is already in use for a private cd.
+* TODO: seize it.
+*/
+   return ERR_PTR(-EEXIST);
+}
+
+__maybe_unused
+static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm)
+{
+   u16 asid;
+   int ret = 0;
+   u64 tcr, par, reg;
+   struct arm_smmu_ctx_desc *cd;
+   struct arm_smmu_ctx_desc *old_cd = NULL;
+
+   asid = mm_context_get(mm);
+   if (!asid)
+   return ERR_PTR(-ESRCH);
+
+   cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+   if (!cd) {
+   ret = -ENOMEM;
+   goto err_put_context;
+   }
+
+   arm_smmu_init_cd(cd);
+
+   xa_lock(_xa);
+   old_cd = arm_smmu_share_asid(asid);
+   if (!old_cd) {
+   old_cd = __xa_store(_xa, asid, cd, GFP_ATOMIC);
+   /*
+* Keep error, clear valid pointers. If there was an old entry
+* it has been moved already by arm_smmu_share_asid().
+*/
+   old_cd = ERR_PTR(xa_err(old_cd));
+   cd->asid = asid;
+   }
+   xa_unlock(_xa);
+
+   if (IS_ERR(old_cd)) {
+   ret = PTR_ERR(old_cd);
+   goto err_free_cd;
+   } else if (old_cd) {
+   if (WARN_ON(old_cd->mm != mm)) {
+   ret = -EINVAL;
+   goto err_free_cd;
+   }
+   kfree(cd);
+   mm_context_put(mm);
+   return old_cd;
+   }
+
+   tcr = FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, 64ULL - VA_BITS) |
+ FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, ARM_LPAE_TCR_RGN_WBWA) |
+ FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, ARM_LPAE_TCR_RGN_WBWA) |
+ FIELD_PREP(CTXDESC_CD_0_TCR_SH0, ARM_LPAE_TCR_SH_IS) |
+ CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_AA64;
+
+   switch (PAGE_SIZE) {
+   case 

[PATCH v5 10/25] iommu/arm-smmu-v3: Manage ASIDs with xarray

2020-04-14 Thread Jean-Philippe Brucker
In preparation for sharing some ASIDs with the CPU, use a global xarray to
store ASIDs and their context. ASID#0 is now reserved, and the ASID
space is global.
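
As a reminder of the xarray semantics relied on here (a stand-alone sketch,
not part of the patch): DEFINE_XARRAY_ALLOC1 reserves index 0, and passing
XA_LIMIT(1, max) to xa_alloc() hands out IDs starting at 1, so ASID#0 can
never be handed to a domain:

#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC1(example_asid_xa);

static int example_alloc_asid(void *ctx, unsigned int asid_bits)
{
        u32 asid;
        int ret;

        /* Allocates in [1, 2^asid_bits - 1]; 0 is never returned. */
        ret = xa_alloc(&example_asid_xa, &asid, ctx,
                       XA_LIMIT(1, (1 << asid_bits) - 1), GFP_KERNEL);
        return ret ? ret : asid;
}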

Signed-off-by: Jean-Philippe Brucker 
---
 drivers/iommu/arm-smmu-v3.c | 27 ++-
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 60a415e8e2b6f..96ee60002e85e 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -664,7 +664,6 @@ struct arm_smmu_device {
 
 #define ARM_SMMU_MAX_ASIDS (1 << 16)
unsigned intasid_bits;
-   DECLARE_BITMAP(asid_map, ARM_SMMU_MAX_ASIDS);
 
 #define ARM_SMMU_MAX_VMIDS (1 << 16)
unsigned intvmid_bits;
@@ -724,6 +723,8 @@ struct arm_smmu_option_prop {
const char *prop;
 };
 
+static DEFINE_XARRAY_ALLOC1(asid_xa);
+
 static struct arm_smmu_option_prop arm_smmu_options[] = {
{ ARM_SMMU_OPT_SKIP_PREFETCH, "hisilicon,broken-prefetch-cmd" },
{ ARM_SMMU_OPT_PAGE0_REGS_ONLY, "cavium,cn9900-broken-page1-regspace"},
@@ -1763,6 +1764,14 @@ static void arm_smmu_free_cd_tables(struct 
arm_smmu_domain *smmu_domain)
cdcfg->cdtab = NULL;
 }
 
+static void arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd)
+{
+   if (!cd->asid)
+   return;
+
+   xa_erase(_xa, cd->asid);
+}
+
 /* Stream table manipulation functions */
 static void
 arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc 
*desc)
@@ -2448,10 +2457,9 @@ static void arm_smmu_domain_free(struct iommu_domain 
*domain)
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
struct arm_smmu_s1_cfg *cfg = _domain->s1_cfg;
 
-   if (cfg->cdcfg.cdtab) {
+   if (cfg->cdcfg.cdtab)
arm_smmu_free_cd_tables(smmu_domain);
-   arm_smmu_bitmap_free(smmu->asid_map, cfg->cd.asid);
-   }
+   arm_smmu_free_asid(>cd);
} else {
struct arm_smmu_s2_cfg *cfg = _domain->s2_cfg;
if (cfg->vmid)
@@ -2466,14 +2474,15 @@ static int arm_smmu_domain_finalise_s1(struct 
arm_smmu_domain *smmu_domain,
   struct io_pgtable_cfg *pgtbl_cfg)
 {
int ret;
-   int asid;
+   u32 asid;
struct arm_smmu_device *smmu = smmu_domain->smmu;
struct arm_smmu_s1_cfg *cfg = _domain->s1_cfg;
typeof(_cfg->arm_lpae_s1_cfg.tcr) tcr = 
_cfg->arm_lpae_s1_cfg.tcr;
 
-   asid = arm_smmu_bitmap_alloc(smmu->asid_map, smmu->asid_bits);
-   if (asid < 0)
-   return asid;
+   ret = xa_alloc(_xa, , >cd,
+  XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
+   if (ret)
+   return ret;
 
cfg->s1cdmax = master->ssid_bits;
 
@@ -2506,7 +2515,7 @@ static int arm_smmu_domain_finalise_s1(struct 
arm_smmu_domain *smmu_domain,
 out_free_cd_tables:
arm_smmu_free_cd_tables(smmu_domain);
 out_free_asid:
-   arm_smmu_bitmap_free(smmu->asid_map, asid);
+   arm_smmu_free_asid(>cd);
return ret;
 }
 
-- 
2.26.0



[PATCH v5 22/25] iommu/arm-smmu-v3: Add stall support for platform devices

2020-04-14 Thread Jean-Philippe Brucker
The SMMU provides a Stall model for handling page faults in platform
devices. It is similar to PCI PRI, but doesn't require devices to have
their own translation cache. Instead, faulting transactions are parked and
the OS is given a chance to fix the page tables and retry the transaction.

Enable stall for devices that support it (opt-in by firmware). When an
event corresponds to a translation error, call the IOMMU fault handler. If
the fault is recoverable, it will call us back to terminate or continue
the stall.
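
To make the flow concrete, here is a rough sketch (not the handler added by
this patch) of how a stalled event queue entry could be turned into a
recoverable fault report, using the EVTQ_* fields defined below.
iommu_report_device_fault() then calls back into arm_smmu_page_response(),
which issues CMDQ_OP_RESUME carrying the stall tag:

#include <linux/bitfield.h>
#include <linux/iommu.h>

static int example_report_stall(struct device *dev, u64 *evt)
{
        struct iommu_fault_event fault_evt = {
                .fault.type = IOMMU_FAULT_PAGE_REQ,
                .fault.prm = {
                        .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE,
                        .grpid = FIELD_GET(EVTQ_1_STAG, evt[1]),
                        .addr  = FIELD_GET(EVTQ_2_ADDR, evt[2]),
                },
        };
        struct iommu_fault_page_request *prm = &fault_evt.fault.prm;

        if (evt[0] & EVTQ_0_SSV) {
                prm->flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
                prm->pasid = FIELD_GET(EVTQ_0_SSID, evt[0]);
        }
        if (evt[1] & EVTQ_1_READ)
                prm->perm |= IOMMU_FAULT_PERM_READ;
        else
                prm->perm |= IOMMU_FAULT_PERM_WRITE;

        return iommu_report_device_fault(dev, &fault_evt);
}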

Signed-off-by: Jean-Philippe Brucker 
---
v4->v5: Improve comment for flush()
---
 include/linux/iommu.h   |   2 +
 drivers/iommu/arm-smmu-v3.c | 282 ++--
 drivers/iommu/of_iommu.c|   5 +-
 3 files changed, 277 insertions(+), 12 deletions(-)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 4b9c25d7246d5..7dd615954e8c7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -578,6 +578,7 @@ struct iommu_group *fsl_mc_device_group(struct device *dev);
  * @iommu_fwnode: firmware handle for this device's IOMMU
  * @iommu_priv: IOMMU driver private data for this device
  * @num_pasid_bits: number of PASID bits supported by this device
+ * @can_stall: the device is allowed to stall
  * @num_ids: number of associated device IDs
  * @ids: IDs which this device may present to the IOMMU
  */
@@ -585,6 +586,7 @@ struct iommu_fwspec {
const struct iommu_ops  *ops;
struct fwnode_handle*iommu_fwnode;
u32 num_pasid_bits;
+   boolcan_stall;
unsigned intnum_ids;
u32 ids[];
 };
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 7a4c5914a2fe2..a7becf1c5347e 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -382,6 +382,13 @@
 #define CMDQ_PRI_1_GRPID   GENMASK_ULL(8, 0)
 #define CMDQ_PRI_1_RESPGENMASK_ULL(13, 12)
 
+#define CMDQ_RESUME_0_SID  GENMASK_ULL(63, 32)
+#define CMDQ_RESUME_0_RESP_TERM0UL
+#define CMDQ_RESUME_0_RESP_RETRY   1UL
+#define CMDQ_RESUME_0_RESP_ABORT   2UL
+#define CMDQ_RESUME_0_RESP GENMASK_ULL(13, 12)
+#define CMDQ_RESUME_1_STAG GENMASK_ULL(15, 0)
+
 #define CMDQ_SYNC_0_CS GENMASK_ULL(13, 12)
 #define CMDQ_SYNC_0_CS_NONE0
 #define CMDQ_SYNC_0_CS_IRQ 1
@@ -398,6 +405,25 @@
 
 #define EVTQ_0_ID  GENMASK_ULL(7, 0)
 
+#define EVT_ID_TRANSLATION_FAULT   0x10
+#define EVT_ID_ADDR_SIZE_FAULT 0x11
+#define EVT_ID_ACCESS_FAULT0x12
+#define EVT_ID_PERMISSION_FAULT0x13
+
+#define EVTQ_0_SSV (1UL << 11)
+#define EVTQ_0_SSIDGENMASK_ULL(31, 12)
+#define EVTQ_0_SID GENMASK_ULL(63, 32)
+#define EVTQ_1_STAGGENMASK_ULL(15, 0)
+#define EVTQ_1_STALL   (1UL << 31)
+#define EVTQ_1_PRIV(1UL << 33)
+#define EVTQ_1_EXEC(1UL << 34)
+#define EVTQ_1_READ(1UL << 35)
+#define EVTQ_1_S2  (1UL << 39)
+#define EVTQ_1_CLASS   GENMASK_ULL(41, 40)
+#define EVTQ_1_TT_READ (1UL << 44)
+#define EVTQ_2_ADDRGENMASK_ULL(63, 0)
+#define EVTQ_3_IPA GENMASK_ULL(51, 12)
+
 /* PRI queue */
 #define PRIQ_ENT_SZ_SHIFT  4
 #define PRIQ_ENT_DWORDS((1 << PRIQ_ENT_SZ_SHIFT) >> 3)
@@ -522,6 +548,13 @@ struct arm_smmu_cmdq_ent {
enum pri_resp   resp;
} pri;
 
+   #define CMDQ_OP_RESUME  0x44
+   struct {
+   u32 sid;
+   u16 stag;
+   u8  resp;
+   } resume;
+
#define CMDQ_OP_CMD_SYNC0x46
struct {
u64 msiaddr;
@@ -557,6 +590,10 @@ struct arm_smmu_queue {
 
u32 __iomem *prod_reg;
u32 __iomem *cons_reg;
+
+   /* Event and PRI */
+   u64 batch;
+   wait_queue_head_t   wq;
 };
 
 struct arm_smmu_queue_poll {
@@ -580,6 +617,7 @@ struct arm_smmu_cmdq_batch {
 
 struct arm_smmu_evtq {
struct arm_smmu_queue   q;
+   struct iopf_queue   *iopf;
u32 max_stalls;
 };
 
@@ -717,6 +755,7 @@ struct arm_smmu_master {
struct arm_smmu_stream  *streams;
unsigned intnum_streams;
boolats_enabled;
+   boolstall_enabled;
unsigned intssid_bits;
 };
 
@@ -734,6 

[PATCH v5 13/25] iommu/arm-smmu-v3: Seize private ASID

2020-04-14 Thread Jean-Philippe Brucker
The SMMU has a single ASID space, the union of shared and private ASID
sets. This means that the SMMU driver competes with the arch allocator
for ASIDs. Shared ASIDs are those of Linux processes, allocated by the
arch, and participate in broadcast TLB maintenance. Private ASIDs are
allocated by the SMMU driver and used for "classic" map/unmap DMA. They
require explicit TLB invalidations.

When we pin down an mm_context and get an ASID that is already in use by
the SMMU, it belongs to a private context. We used to simply abort the
bind, but this is unfair to users, who would find themselves unable to
bind a few seemingly random processes. Try to allocate a new private ASID
for the context, and make the old ASID shared.

Introduce a new lock to prevent races when rewriting context
descriptors. Unfortunately it has to be a spinlock since we take it
while holding the asid lock, which will be held in non-sleepable context
(freeing ASIDs from an RCU callback).

Signed-off-by: Jean-Philippe Brucker 
---
 drivers/iommu/arm-smmu-v3.c | 83 +
 1 file changed, 66 insertions(+), 17 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 09f4f712fb103..8fbc5da133ae4 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -730,6 +730,7 @@ struct arm_smmu_option_prop {
 };
 
 static DEFINE_XARRAY_ALLOC1(asid_xa);
+static DEFINE_SPINLOCK(contexts_lock);
 
 static struct arm_smmu_option_prop arm_smmu_options[] = {
{ ARM_SMMU_OPT_SKIP_PREFETCH, "hisilicon,broken-prefetch-cmd" },
@@ -1534,6 +1535,17 @@ static int arm_smmu_cmdq_batch_submit(struct 
arm_smmu_device *smmu,
 }
 
 /* Context descriptor manipulation functions */
+static void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid)
+{
+   struct arm_smmu_cmdq_ent cmd = {
+   .opcode = CMDQ_OP_TLBI_NH_ASID,
+   .tlbi.asid = asid,
+   };
+
+   arm_smmu_cmdq_issue_cmd(smmu, );
+   arm_smmu_cmdq_issue_sync(smmu);
+}
+
 static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
 int ssid, bool leaf)
 {
@@ -1568,7 +1580,7 @@ static int arm_smmu_alloc_cd_leaf_table(struct 
arm_smmu_device *smmu,
size_t size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3);
 
l1_desc->l2ptr = dmam_alloc_coherent(smmu->dev, size,
-_desc->l2ptr_dma, GFP_KERNEL);
+_desc->l2ptr_dma, GFP_ATOMIC);
if (!l1_desc->l2ptr) {
dev_warn(smmu->dev,
 "failed to allocate context descriptor table\n");
@@ -1614,8 +1626,8 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_domain 
*smmu_domain,
return l1_desc->l2ptr + idx * CTXDESC_CD_DWORDS;
 }
 
-static int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain,
-  int ssid, struct arm_smmu_ctx_desc *cd)
+static int __arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain,
+int ssid, struct arm_smmu_ctx_desc *cd)
 {
/*
 * This function handles the following cases:
@@ -1691,6 +1703,17 @@ static int arm_smmu_write_ctx_desc(struct 
arm_smmu_domain *smmu_domain,
return 0;
 }
 
+static int arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain,
+  int ssid, struct arm_smmu_ctx_desc *cd)
+{
+   int ret;
+
+   spin_lock(_lock);
+   ret = __arm_smmu_write_ctx_desc(smmu_domain, ssid, cd);
+   spin_unlock(_lock);
+   return ret;
+}
+
 static int arm_smmu_alloc_cd_tables(struct arm_smmu_domain *smmu_domain)
 {
int ret;
@@ -1794,9 +1817,18 @@ static bool arm_smmu_free_asid(struct arm_smmu_ctx_desc 
*cd)
return free;
 }
 
+/*
+ * Try to reserve this ASID in the SMMU. If it is in use, try to steal it from
+ * the private entry. Careful here, we may be modifying the context tables of
+ * another SMMU!
+ */
 static struct arm_smmu_ctx_desc *arm_smmu_share_asid(u16 asid)
 {
+   int ret;
+   u32 new_asid;
struct arm_smmu_ctx_desc *cd;
+   struct arm_smmu_device *smmu;
+   struct arm_smmu_domain *smmu_domain;
 
cd = xa_load(_xa, asid);
if (!cd)
@@ -1812,11 +1844,31 @@ static struct arm_smmu_ctx_desc 
*arm_smmu_share_asid(u16 asid)
return cd;
}
 
+   smmu_domain = container_of(cd, struct arm_smmu_domain, s1_cfg.cd);
+   smmu = smmu_domain->smmu;
+
+   /*
+* Race with unmap: TLB invalidations will start targeting the new ASID,
+* which isn't assigned yet. We'll do an invalidate-all on the old ASID
+* later, so it doesn't matter.
+*/
+   ret = __xa_alloc(_xa, _asid, cd,
+XA_LIMIT(1, 1 << smmu->asid_bits), GFP_ATOMIC);
+   if (ret)
+   return ERR_PTR(-ENOSPC);
+   cd->asid = new_asid;
+
/*
-* Ouch, ASID is already in 

[PATCH v5 05/25] iommu/iopf: Handle mm faults

2020-04-14 Thread Jean-Philippe Brucker
When a recoverable page fault is handled by the fault workqueue, find the
associated mm and call handle_mm_fault.

Signed-off-by: Jean-Philippe Brucker 
---
v4->v5: no need to call mmput_async() anymore, since the MMU release()
doesn't flush the IOPF queue anymore.
---
 drivers/iommu/io-pgfault.c | 77 +-
 1 file changed, 75 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
index 5bba8e6a13be2..fd4244023b33f 100644
--- a/drivers/iommu/io-pgfault.c
+++ b/drivers/iommu/io-pgfault.c
@@ -7,6 +7,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -76,8 +77,57 @@ static int iopf_complete_group(struct device *dev, struct 
iopf_fault *iopf,
 static enum iommu_page_response_code
 iopf_handle_single(struct iopf_fault *iopf)
 {
-   /* TODO */
-   return -ENODEV;
+   vm_fault_t ret;
+   struct mm_struct *mm;
+   struct vm_area_struct *vma;
+   unsigned int access_flags = 0;
+   unsigned int fault_flags = FAULT_FLAG_REMOTE;
+   struct iommu_fault_page_request *prm = >fault.prm;
+   enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID;
+
+   if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID))
+   return status;
+
+   mm = iommu_sva_find(prm->pasid);
+   if (IS_ERR_OR_NULL(mm))
+   return status;
+
+   down_read(>mmap_sem);
+
+   vma = find_extend_vma(mm, prm->addr);
+   if (!vma)
+   /* Unmapped area */
+   goto out_put_mm;
+
+   if (prm->perm & IOMMU_FAULT_PERM_READ)
+   access_flags |= VM_READ;
+
+   if (prm->perm & IOMMU_FAULT_PERM_WRITE) {
+   access_flags |= VM_WRITE;
+   fault_flags |= FAULT_FLAG_WRITE;
+   }
+
+   if (prm->perm & IOMMU_FAULT_PERM_EXEC) {
+   access_flags |= VM_EXEC;
+   fault_flags |= FAULT_FLAG_INSTRUCTION;
+   }
+
+   if (!(prm->perm & IOMMU_FAULT_PERM_PRIV))
+   fault_flags |= FAULT_FLAG_USER;
+
+   if (access_flags & ~vma->vm_flags)
+   /* Access fault */
+   goto out_put_mm;
+
+   ret = handle_mm_fault(vma, prm->addr, fault_flags);
+   status = ret & VM_FAULT_ERROR ? IOMMU_PAGE_RESP_INVALID :
+   IOMMU_PAGE_RESP_SUCCESS;
+
+out_put_mm:
+   up_read(>mmap_sem);
+   mmput(mm);
+
+   return status;
 }
 
 static void iopf_handle_group(struct work_struct *work)
@@ -112,6 +162,29 @@ static void iopf_handle_group(struct work_struct *work)
  *
  * Add a fault to the device workqueue, to be handled by mm.
  *
+ * This module doesn't handle PCI PASID Stop Marker; IOMMU drivers must discard
+ * them before reporting faults. A PASID Stop Marker (LRW = 0b100) doesn't
+ * expect a response. It may be generated when disabling a PASID (issuing a
+ * PASID stop request) by some PCI devices.
+ *
+ * The PASID stop request is issued by the device driver before unbind(). Once
+ * it completes, no page request is generated for this PASID anymore and
+ * outstanding ones have been pushed to the IOMMU (as per PCIe 4.0r1.0 - 6.20.1
+ * and 10.4.1.2 - Managing PASID TLP Prefix Usage). Some PCI devices will wait
+ * for all outstanding page requests to come back with a response before
+ * completing the PASID stop request. Others do not wait for page responses, 
and
+ * instead issue this Stop Marker that tells us when the PASID can be
+ * reallocated.
+ *
+ * It is safe to discard the Stop Marker because it is an optimization.
+ * a. Page requests, which are posted requests, have been flushed to the IOMMU
+ *when the stop request completes.
+ * b. We flush all fault queues on unbind() before freeing the PASID.
+ *
+ * So even though the Stop Marker might be issued by the device *after* the 
stop
+ * request completes, outstanding faults will have been dealt with by the time
+ * we free the PASID.
+ *
  * Return: 0 on success and <0 on error.
  */
 int iommu_queue_iopf(struct iommu_fault *fault, void *cookie)
-- 
2.26.0



[PATCH v5 02/25] iommu/sva: Manage process address spaces

2020-04-14 Thread Jean-Philippe Brucker
Add a small library to help IOMMU drivers manage process address spaces
bound to their devices. Register an MMU notifier to track modifications
to each address space bound to one or more devices.

IOMMU drivers must implement the io_mm_ops and can then use the helpers
provided by this library to easily implement the SVA API introduced by
commit 26b25a2b98e4 ("iommu: Bind process address spaces to devices").
The io_mm_ops are:

alloc: Allocate a PASID context private to the IOMMU driver. There is a
  single context per mm. IOMMU drivers may perform arch-specific
  operations in there, for example pinning down a CPU ASID (on Arm).

attach: Attach a context to the device, by setting up the PASID table
  entry.

invalidate: Invalidate TLB entries for this address range.

clear: Clear the context and invalidate IOTLBs. Called if the mm exits
  before unbind(). DMA may still be issued.

detach: Detach a context from the device. Unlike clear() this is always
  called, at unbind(), and DMA isn't issued anymore.

free: Free a context.
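
For illustration, a minimal skeleton of these ops for a hypothetical driver
(the callback signatures match struct io_mm_ops in iommu-sva.h below; the
bodies are placeholders):

#include <linux/device.h>
#include <linux/slab.h>
#include "iommu-sva.h"

struct example_pasid_ctx {
        int dummy;
};

static void *example_mm_alloc(struct mm_struct *mm)
{
        return kzalloc(sizeof(struct example_pasid_ctx), GFP_KERNEL);
}

static int example_mm_attach(struct device *dev, int pasid, void *ctx,
                             bool attach_domain)
{
        /* Write the PASID table entry for @pasid. */
        return 0;
}

static void example_mm_invalidate(struct device *dev, int pasid, void *ctx,
                                  unsigned long vaddr, size_t size)
{
        /* Invalidate IOTLB entries for [vaddr, vaddr + size). Cannot sleep. */
}

static void example_mm_clear(struct device *dev, int pasid, void *ctx)
{
        /* mm exited: quietly abort incoming DMA and page faults. */
}

static void example_mm_detach(struct device *dev, int pasid, void *ctx,
                              bool detach_domain, bool cleared)
{
        /* Final teardown; IOTLBs were already invalidated if @cleared. */
}

static void example_mm_free(void *ctx)
{
        kfree(ctx);
}

static const struct io_mm_ops example_mm_ops = {
        .alloc          = example_mm_alloc,
        .attach         = example_mm_attach,
        .invalidate     = example_mm_invalidate,
        .clear          = example_mm_clear,
        .detach         = example_mm_detach,
        .free           = example_mm_free,
};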

Signed-off-by: Jean-Philippe Brucker 
---
v4->v5:
* Simplify locking
* Add clear() op
* Improve doc
---
 drivers/iommu/Kconfig |   7 +
 drivers/iommu/Makefile|   1 +
 drivers/iommu/iommu-sva.h |  78 ++
 include/linux/iommu.h |   4 +
 drivers/iommu/iommu-sva.c | 527 ++
 drivers/iommu/iommu.c |   1 +
 6 files changed, 618 insertions(+)
 create mode 100644 drivers/iommu/iommu-sva.h
 create mode 100644 drivers/iommu/iommu-sva.c

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 58b4a4dbfc78b..e81842f59b037 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -102,6 +102,13 @@ config IOMMU_DMA
select IRQ_MSI_IOMMU
select NEED_SG_DMA_LENGTH
 
+# Shared Virtual Addressing library
+config IOMMU_SVA
+   bool
+   select IOASID
+   select IOMMU_API
+   select MMU_NOTIFIER
+
 config FSL_PAMU
bool "Freescale IOMMU support"
depends on PCI
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 9f33fdb3bb051..40c800dd4e3ef 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -37,3 +37,4 @@ obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
 obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
 obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
 obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
+obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
diff --git a/drivers/iommu/iommu-sva.h b/drivers/iommu/iommu-sva.h
new file mode 100644
index 0..3c4c7e886a6be
--- /dev/null
+++ b/drivers/iommu/iommu-sva.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SVA library for IOMMU drivers
+ */
+#ifndef _IOMMU_SVA_H
+#define _IOMMU_SVA_H
+
+#include 
+#include 
+#include 
+
+struct io_mm_ops {
+   /* Allocate a PASID context for an mm */
+   void *(*alloc)(struct mm_struct *mm);
+
+   /*
+* Attach a PASID context to a device. Write the entry into the PASID
+* table.
+*
+* @attach_domain is true when no other device in the IOMMU domain is
+*   already attached to this context. IOMMU drivers that share the
+*   PASID tables within a domain don't need to write the PASID entry
+*   when @attach_domain is false.
+*/
+   int (*attach)(struct device *dev, int pasid, void *ctx,
+ bool attach_domain);
+
+   /* Invalidate a range of addresses. Cannot sleep. */
+   void (*invalidate)(struct device *dev, int pasid, void *ctx,
+  unsigned long vaddr, size_t size);
+
+   /*
+* Clear a PASID context, invalidate IOTLBs. Called when the address
+* space attached to this context exits. Until detach() is called, the
+* PASID is not freed. The IOMMU driver should expect incoming DMA
+* transactions for this PASID and abort them quietly. The IOMMU driver
+* can still queue incoming page faults for this PASID, they will be
+* silently aborted.
+*/
+   void (*clear)(struct device *dev, int pasid, void *ctx);
+
+   /*
+* Detach a PASID context from a device. Unlike exit() this is final.
+* There are no more incoming DMA transactions, and page faults have
+* been flushed.
+*
+* @detach_domain is true when no other device in the IOMMU domain is
+*   still attached to this context. IOMMU drivers that share the PASID
+*   table within a domain don't need to clear the PASID entry when
+*   @detach_domain is false, only invalidate the caches.
+*
+* @cleared is true if the clear() op has already been called for this
+*   context. In this case there is no need to invalidate IOTLBs
+*/
+   void (*detach)(struct device *dev, int pasid, void *ctx,
+  bool detach_domain, bool cleared);
+
+   /* Free a context. Cannot sleep. */
+   void (*free)(void *ctx);
+};
+
+struct iommu_sva_param {

[PATCH v5 00/25] iommu: Shared Virtual Addressing and SMMUv3 support

2020-04-14 Thread Jean-Philippe Brucker
Shared Virtual Addressing (SVA) allows process page tables to be shared
with devices through the IOMMU. Add a generic implementation of the IOMMU
SVA API, and add support in the Arm SMMUv3 driver.

Since v4 [1] I changed the PASID lifetime. It isn't released when the
corresponding process address space dies, but when the device driver calls
unbind. This simplifies the mmput() path, since we no longer need to ensure
that the device driver stops DMA there. For more details see my
proposal from last week [2], which is a requirement for this series. As a
result patch 1 has separate clear() and detach() operations, and patch 17
has a new context descriptor state. 

Other changes are a simplification of the locking in patch 1 and overall
cleanups following review comments.

[1] [PATCH v4 00/26] iommu: Shared Virtual Addressing and SMMUv3 support

https://lore.kernel.org/linux-iommu/20200224182401.353359-1-jean-phili...@linaro.org/
[2] [PATCH 0/2] iommu: Remove iommu_sva_ops::mm_exit()

https://lore.kernel.org/linux-iommu/20200408140427.212807-1-jean-phili...@linaro.org/

Jean-Philippe Brucker (25):
  mm/mmu_notifiers: pass private data down to alloc_notifier()
  iommu/sva: Manage process address spaces
  iommu: Add a page fault handler
  iommu/sva: Search mm by PASID
  iommu/iopf: Handle mm faults
  iommu/sva: Register page fault handler
  arm64: mm: Add asid_gen_match() helper
  arm64: mm: Pin down ASIDs for sharing mm with devices
  iommu/io-pgtable-arm: Move some definitions to a header
  iommu/arm-smmu-v3: Manage ASIDs with xarray
  arm64: cpufeature: Export symbol read_sanitised_ftr_reg()
  iommu/arm-smmu-v3: Share process page tables
  iommu/arm-smmu-v3: Seize private ASID
  iommu/arm-smmu-v3: Add support for VHE
  iommu/arm-smmu-v3: Enable broadcast TLB maintenance
  iommu/arm-smmu-v3: Add SVA feature checking
  iommu/arm-smmu-v3: Implement mm operations
  iommu/arm-smmu-v3: Hook up ATC invalidation to mm ops
  iommu/arm-smmu-v3: Add support for Hardware Translation Table Update
  iommu/arm-smmu-v3: Maintain a SID->device structure
  dt-bindings: document stall property for IOMMU masters
  iommu/arm-smmu-v3: Add stall support for platform devices
  PCI/ATS: Add PRI stubs
  PCI/ATS: Export PRI functions
  iommu/arm-smmu-v3: Add support for PRI

 drivers/iommu/Kconfig |   13 +
 drivers/iommu/Makefile|2 +
 .../devicetree/bindings/iommu/iommu.txt   |   18 +
 arch/arm64/include/asm/mmu.h  |1 +
 arch/arm64/include/asm/mmu_context.h  |   11 +-
 drivers/iommu/io-pgtable-arm.h|   30 +
 drivers/iommu/iommu-sva.h |   78 +
 include/linux/iommu.h |   75 +
 include/linux/mmu_notifier.h  |   11 +-
 include/linux/pci-ats.h   |8 +
 arch/arm64/kernel/cpufeature.c|1 +
 arch/arm64/mm/context.c   |  103 +-
 drivers/iommu/arm-smmu-v3.c   | 1398 +++--
 drivers/iommu/io-pgfault.c|  525 +++
 drivers/iommu/io-pgtable-arm.c|   27 +-
 drivers/iommu/iommu-sva.c |  557 +++
 drivers/iommu/iommu.c |1 +
 drivers/iommu/of_iommu.c  |5 +-
 drivers/misc/sgi-gru/grutlbpurge.c|5 +-
 drivers/pci/ats.c |4 +
 mm/mmu_notifier.c |6 +-
 21 files changed, 2716 insertions(+), 163 deletions(-)
 create mode 100644 drivers/iommu/io-pgtable-arm.h
 create mode 100644 drivers/iommu/iommu-sva.h
 create mode 100644 drivers/iommu/io-pgfault.c
 create mode 100644 drivers/iommu/iommu-sva.c

-- 
2.26.0



[PATCH v5 25/25] iommu/arm-smmu-v3: Add support for PRI

2020-04-14 Thread Jean-Philippe Brucker
For PCI devices that support it, enable the PRI capability and handle PRI
Page Requests with the generic fault handler. It is enabled on demand by
iommu_sva_device_init().
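
For reference, a rough sketch (not the code in this patch) of the on-demand
PRI enable sequence using the PCI/ATS interfaces touched by this series;
the queue depth passed to pci_enable_pri() is illustrative:

#include <linux/pci.h>
#include <linux/pci-ats.h>

static int example_enable_pri(struct pci_dev *pdev, u32 max_requests)
{
        int ret;

        /* If set, PRG responses must carry a PASID (prg_resp_needs_ssid). */
        if (pci_prg_resp_pasid_required(pdev))
                pci_info(pdev, "PRG responses need a PASID\n");

        ret = pci_reset_pri(pdev);
        if (ret)
                return ret;

        return pci_enable_pri(pdev, max_requests);
}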

Signed-off-by: Jean-Philippe Brucker 
---
 drivers/iommu/arm-smmu-v3.c | 284 +---
 1 file changed, 234 insertions(+), 50 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index a7becf1c5347e..8017700c33c46 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -251,6 +251,7 @@
 #define STRTAB_STE_1_S1COR GENMASK_ULL(5, 4)
 #define STRTAB_STE_1_S1CSH GENMASK_ULL(7, 6)
 
+#define STRTAB_STE_1_PPAR  (1UL << 18)
 #define STRTAB_STE_1_S1STALLD  (1UL << 27)
 
 #define STRTAB_STE_1_EATS  GENMASK_ULL(29, 28)
@@ -381,6 +382,9 @@
 #define CMDQ_PRI_0_SID GENMASK_ULL(63, 32)
 #define CMDQ_PRI_1_GRPID   GENMASK_ULL(8, 0)
 #define CMDQ_PRI_1_RESPGENMASK_ULL(13, 12)
+#define CMDQ_PRI_1_RESP_FAILURE0UL
+#define CMDQ_PRI_1_RESP_INVALID1UL
+#define CMDQ_PRI_1_RESP_SUCCESS2UL
 
 #define CMDQ_RESUME_0_SID  GENMASK_ULL(63, 32)
 #define CMDQ_RESUME_0_RESP_TERM0UL
@@ -453,12 +457,6 @@ module_param_named(disable_bypass, disable_bypass, bool, 
S_IRUGO);
 MODULE_PARM_DESC(disable_bypass,
"Disable bypass streams such that incoming transactions from devices 
that are not attached to an iommu domain will report an abort back to the 
device and will not be allowed to pass through the SMMU.");
 
-enum pri_resp {
-   PRI_RESP_DENY = 0,
-   PRI_RESP_FAIL = 1,
-   PRI_RESP_SUCC = 2,
-};
-
 enum arm_smmu_msi_index {
EVTQ_MSI_INDEX,
GERROR_MSI_INDEX,
@@ -545,7 +543,7 @@ struct arm_smmu_cmdq_ent {
u32 sid;
u32 ssid;
u16 grpid;
-   enum pri_resp   resp;
+   u8  resp;
} pri;
 
#define CMDQ_OP_RESUME  0x44
@@ -623,6 +621,7 @@ struct arm_smmu_evtq {
 
 struct arm_smmu_priq {
struct arm_smmu_queue   q;
+   struct iopf_queue   *iopf;
 };
 
 /* High-level stream table and context descriptor structures */
@@ -756,6 +755,8 @@ struct arm_smmu_master {
unsigned intnum_streams;
boolats_enabled;
boolstall_enabled;
+   boolpri_supported;
+   boolprg_resp_needs_ssid;
unsigned intssid_bits;
 };
 
@@ -1034,14 +1035,6 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
arm_smmu_cmdq_ent *ent)
cmd[0] |= FIELD_PREP(CMDQ_PRI_0_SSID, ent->pri.ssid);
cmd[0] |= FIELD_PREP(CMDQ_PRI_0_SID, ent->pri.sid);
cmd[1] |= FIELD_PREP(CMDQ_PRI_1_GRPID, ent->pri.grpid);
-   switch (ent->pri.resp) {
-   case PRI_RESP_DENY:
-   case PRI_RESP_FAIL:
-   case PRI_RESP_SUCC:
-   break;
-   default:
-   return -EINVAL;
-   }
cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
break;
case CMDQ_OP_RESUME:
@@ -1621,6 +1614,7 @@ static int arm_smmu_page_response(struct device *dev,
 {
struct arm_smmu_cmdq_ent cmd = {0};
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
+   bool pasid_valid = resp->flags & IOMMU_PAGE_RESP_PASID_VALID;
int sid = master->streams[0].id;
 
if (master->stall_enabled) {
@@ -1638,8 +1632,27 @@ static int arm_smmu_page_response(struct device *dev,
default:
return -EINVAL;
}
+   } else if (master->pri_supported) {
+   cmd.opcode  = CMDQ_OP_PRI_RESP;
+   cmd.substream_valid = pasid_valid &&
+ master->prg_resp_needs_ssid;
+   cmd.pri.sid = sid;
+   cmd.pri.ssid= resp->pasid;
+   cmd.pri.grpid   = resp->grpid;
+   switch (resp->code) {
+   case IOMMU_PAGE_RESP_FAILURE:
+   cmd.pri.resp = CMDQ_PRI_1_RESP_FAILURE;
+   break;
+   case IOMMU_PAGE_RESP_INVALID:
+   cmd.pri.resp = CMDQ_PRI_1_RESP_INVALID;
+   break;
+   case IOMMU_PAGE_RESP_SUCCESS:
+   cmd.pri.resp = CMDQ_PRI_1_RESP_SUCCESS;
+   break;
+   default:
+   return -EINVAL;
+   }
} else {
-   /* TODO: 

[PATCH v5 20/25] iommu/arm-smmu-v3: Maintain a SID->device structure

2020-04-14 Thread Jean-Philippe Brucker
When handling faults from the event or PRI queue, we need to find the
struct device associated with a SID. Add an rb-tree to keep track of SIDs.
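
For completeness, a sketch of the insertion side (the lookup,
arm_smmu_find_master(), is in the diff below); this is plain
<linux/rbtree.h> usage and ignores duplicate SIDs:

static void example_insert_stream(struct arm_smmu_device *smmu,
                                  struct arm_smmu_stream *new)
{
        struct rb_node **node = &smmu->streams.rb_node;
        struct rb_node *parent = NULL;
        struct arm_smmu_stream *cur;

        mutex_lock(&smmu->streams_mutex);
        while (*node) {
                cur = rb_entry(*node, struct arm_smmu_stream, node);
                parent = *node;
                if (cur->id < new->id)
                        node = &(*node)->rb_right;
                else
                        node = &(*node)->rb_left;
        }
        rb_link_node(&new->node, parent, node);
        rb_insert_color(&new->node, &smmu->streams);
        mutex_unlock(&smmu->streams_mutex);
}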

Signed-off-by: Jean-Philippe Brucker 
---
 drivers/iommu/arm-smmu-v3.c | 179 +---
 1 file changed, 147 insertions(+), 32 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 4ed9df15581af..7a4c5914a2fe2 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -697,6 +697,15 @@ struct arm_smmu_device {
 
/* IOMMU core code handle */
struct iommu_device iommu;
+
+   struct rb_root  streams;
+   struct mutexstreams_mutex;
+};
+
+struct arm_smmu_stream {
+   u32 id;
+   struct arm_smmu_master  *master;
+   struct rb_node  node;
 };
 
 /* SMMU private data for each master */
@@ -705,8 +714,8 @@ struct arm_smmu_master {
struct device   *dev;
struct arm_smmu_domain  *domain;
struct list_headdomain_head;
-   u32 *sids;
-   unsigned intnum_sids;
+   struct arm_smmu_stream  *streams;
+   unsigned intnum_streams;
boolats_enabled;
unsigned intssid_bits;
 };
@@ -1592,8 +1601,8 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain 
*smmu_domain,
 
spin_lock_irqsave(_domain->devices_lock, flags);
list_for_each_entry(master, _domain->devices, domain_head) {
-   for (i = 0; i < master->num_sids; i++) {
-   cmd.cfgi.sid = master->sids[i];
+   for (i = 0; i < master->num_streams; i++) {
+   cmd.cfgi.sid = master->streams[i].id;
arm_smmu_cmdq_batch_add(smmu, , );
}
}
@@ -,6 +2231,32 @@ static int arm_smmu_init_l2_strtab(struct 
arm_smmu_device *smmu, u32 sid)
return 0;
 }
 
+__maybe_unused
+static struct arm_smmu_master *
+arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid)
+{
+   struct rb_node *node;
+   struct arm_smmu_stream *stream;
+   struct arm_smmu_master *master = NULL;
+
+   mutex_lock(>streams_mutex);
+   node = smmu->streams.rb_node;
+   while (node) {
+   stream = rb_entry(node, struct arm_smmu_stream, node);
+   if (stream->id < sid) {
+   node = node->rb_right;
+   } else if (stream->id > sid) {
+   node = node->rb_left;
+   } else {
+   master = stream->master;
+   break;
+   }
+   }
+   mutex_unlock(>streams_mutex);
+
+   return master;
+}
+
 /* IRQ and event handlers */
 static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
 {
@@ -2455,8 +2490,8 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master 
*master, int ssid)
 
arm_smmu_atc_inv_to_cmd(ssid, 0, 0, );
 
-   for (i = 0; i < master->num_sids; i++) {
-   cmd.atc.sid = master->sids[i];
+   for (i = 0; i < master->num_streams; i++) {
+   cmd.atc.sid = master->streams[i].id;
arm_smmu_cmdq_issue_cmd(master->smmu, );
}
 
@@ -2499,8 +2534,8 @@ static int arm_smmu_atc_inv_domain(struct arm_smmu_domain 
*smmu_domain,
if (!master->ats_enabled)
continue;
 
-   for (i = 0; i < master->num_sids; i++) {
-   cmd.atc.sid = master->sids[i];
+   for (i = 0; i < master->num_streams; i++) {
+   cmd.atc.sid = master->streams[i].id;
arm_smmu_cmdq_batch_add(smmu_domain->smmu, , );
}
}
@@ -2906,13 +2941,13 @@ static void arm_smmu_install_ste_for_dev(struct 
arm_smmu_master *master)
int i, j;
struct arm_smmu_device *smmu = master->smmu;
 
-   for (i = 0; i < master->num_sids; ++i) {
-   u32 sid = master->sids[i];
+   for (i = 0; i < master->num_streams; ++i) {
+   u32 sid = master->streams[i].id;
__le64 *step = arm_smmu_get_step_for_sid(smmu, sid);
 
/* Bridged PCI devices may end up with duplicated IDs */
for (j = 0; j < i; j++)
-   if (master->sids[j] == sid)
+   if (master->streams[j].id == sid)
break;
if (j < i)
continue;
@@ -3171,8 +3206,8 @@ static void arm_smmu_mm_invalidate(struct device *dev, 
int pasid, void *entry,
 
arm_smmu_atc_inv_to_cmd(pasid, iova, size, );
 
-   for (i = 0; i < master->num_sids; i++) {
-   cmd.atc.sid = master->sids[i];
+   for (i = 0; i < master->num_streams; i++) {
+   

[PATCH v5 04/25] iommu/sva: Search mm by PASID

2020-04-14 Thread Jean-Philippe Brucker
The fault handler will need to find an mm given its PASID. This is the
reason we have an IDR for storing address spaces, so hook it up.
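
For illustration, here is how the fault handler added in patch 05 consumes
this (sketch only; the reference taken by iommu_sva_find() must be released
with mmput()):

#include <linux/iommu.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>

static int example_handle_fault(int pasid, unsigned long addr)
{
        struct mm_struct *mm;

        mm = iommu_sva_find(pasid);
        if (IS_ERR_OR_NULL(mm))
                return -ENODEV;

        down_read(&mm->mmap_sem);
        /* find_extend_vma(mm, addr) + handle_mm_fault() go here, see patch 05 */
        up_read(&mm->mmap_sem);

        mmput(mm);
        return 0;
}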

Signed-off-by: Jean-Philippe Brucker 
---
 include/linux/iommu.h |  9 +
 drivers/iommu/iommu-sva.c | 19 +++
 2 files changed, 28 insertions(+)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 5a3d092c2568a..4b9c25d7246d5 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1081,6 +1081,15 @@ void iommu_debugfs_setup(void);
 static inline void iommu_debugfs_setup(void) {}
 #endif
 
+#ifdef CONFIG_IOMMU_SVA
+extern struct mm_struct *iommu_sva_find(int pasid);
+#else /* !CONFIG_IOMMU_SVA */
+static inline struct mm_struct *iommu_sva_find(int pasid)
+{
+   return NULL;
+}
+#endif /* !CONFIG_IOMMU_SVA */
+
 #ifdef CONFIG_IOMMU_PAGE_FAULT
 extern int iommu_queue_iopf(struct iommu_fault *fault, void *cookie);
 
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index 7fecc74a9f7d6..b177d6cbf4fff 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -525,3 +525,22 @@ int iommu_sva_get_pasid_generic(struct iommu_sva *handle)
return bond->io_mm->pasid;
 }
 EXPORT_SYMBOL_GPL(iommu_sva_get_pasid_generic);
+
+/* ioasid wants a void * argument */
+static bool __mmget_not_zero(void *mm)
+{
+   return mmget_not_zero(mm);
+}
+
+/**
+ * iommu_sva_find() - Find mm associated to the given PASID
+ * @pasid: Process Address Space ID assigned to the mm
+ *
+ * Returns the mm corresponding to this PASID, or an error if not found. A
+ * reference to the mm is taken, and must be released with mmput().
+ */
+struct mm_struct *iommu_sva_find(int pasid)
+{
+   return ioasid_find(_pasid, pasid, __mmget_not_zero);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_find);
-- 
2.26.0



[PATCH v5 23/25] PCI/ATS: Add PRI stubs

2020-04-14 Thread Jean-Philippe Brucker
The SMMUv3 driver, which can be built without CONFIG_PCI, will soon gain
support for PRI.  Partially revert commit c6e9aefbf9db ("PCI/ATS: Remove
unused PRI and PASID stubs") to re-introduce the PRI stubs, and avoid
adding more #ifdefs to the SMMU driver.

Acked-by: Bjorn Helgaas 
Signed-off-by: Jean-Philippe Brucker 
---
 include/linux/pci-ats.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h
index f75c307f346de..e9e266df9b37c 100644
--- a/include/linux/pci-ats.h
+++ b/include/linux/pci-ats.h
@@ -28,6 +28,14 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs);
 void pci_disable_pri(struct pci_dev *pdev);
 int pci_reset_pri(struct pci_dev *pdev);
 int pci_prg_resp_pasid_required(struct pci_dev *pdev);
+#else /* CONFIG_PCI_PRI */
+static inline int pci_enable_pri(struct pci_dev *pdev, u32 reqs)
+{ return -ENODEV; }
+static inline void pci_disable_pri(struct pci_dev *pdev) { }
+static inline int pci_reset_pri(struct pci_dev *pdev)
+{ return -ENODEV; }
+static inline int pci_prg_resp_pasid_required(struct pci_dev *pdev)
+{ return 0; }
 #endif /* CONFIG_PCI_PRI */
 
 #ifdef CONFIG_PCI_PASID
-- 
2.26.0



[PATCH v5 19/25] iommu/arm-smmu-v3: Add support for Hardware Translation Table Update

2020-04-14 Thread Jean-Philippe Brucker
If the SMMU supports it and the kernel was built with HTTU support, enable
hardware update of access and dirty flags. This is essential for shared
page tables, to reduce the number of access faults on the fault queue.

We can enable HTTU even if the CPUs don't support it, because the kernel
always checks for the HW dirty bit and updates the PTE flags atomically.

Signed-off-by: Jean-Philippe Brucker 
---
v4->v5: bump feature bits
---
 drivers/iommu/arm-smmu-v3.c | 24 +++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index c4bffb14461aa..4ed9df15581af 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -57,6 +57,8 @@
 #define IDR0_ASID16(1 << 12)
 #define IDR0_ATS   (1 << 10)
 #define IDR0_HYP   (1 << 9)
+#define IDR0_HD(1 << 7)
+#define IDR0_HA(1 << 6)
 #define IDR0_BTM   (1 << 5)
 #define IDR0_COHACC(1 << 4)
 #define IDR0_TTF   GENMASK(3, 2)
@@ -308,6 +310,9 @@
 #define CTXDESC_CD_0_TCR_IPS   GENMASK_ULL(34, 32)
 #define CTXDESC_CD_0_TCR_TBI0  (1ULL << 38)
 
+#define CTXDESC_CD_0_TCR_HA(1UL << 43)
+#define CTXDESC_CD_0_TCR_HD(1UL << 42)
+
 #define CTXDESC_CD_0_AA64  (1UL << 41)
 #define CTXDESC_CD_0_S (1UL << 44)
 #define CTXDESC_CD_0_R (1UL << 45)
@@ -659,6 +664,8 @@ struct arm_smmu_device {
 #define ARM_SMMU_FEAT_E2H  (1 << 16)
 #define ARM_SMMU_FEAT_BTM  (1 << 17)
 #define ARM_SMMU_FEAT_SVA  (1 << 18)
+#define ARM_SMMU_FEAT_HA   (1 << 19)
+#define ARM_SMMU_FEAT_HD   (1 << 20)
u32 features;
 
 #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
@@ -1689,10 +1696,17 @@ static int __arm_smmu_write_ctx_desc(struct 
arm_smmu_domain *smmu_domain,
 * this substream's traffic
 */
} else { /* (1) and (2) */
+   u64 tcr = cd->tcr;
+
cdptr[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK);
cdptr[2] = 0;
cdptr[3] = cpu_to_le64(cd->mair);
 
+   if (!(smmu->features & ARM_SMMU_FEAT_HD))
+   tcr &= ~CTXDESC_CD_0_TCR_HD;
+   if (!(smmu->features & ARM_SMMU_FEAT_HA))
+   tcr &= ~CTXDESC_CD_0_TCR_HA;
+
/*
 * STE is live, and the SMMU might read dwords of this CD in any
 * order. Ensure that it observes valid values before reading
@@ -1700,7 +1714,7 @@ static int __arm_smmu_write_ctx_desc(struct 
arm_smmu_domain *smmu_domain,
 */
arm_smmu_sync_cd(smmu_domain, ssid, true);
 
-   val = cd->tcr |
+   val = tcr |
 #ifdef __BIG_ENDIAN
CTXDESC_CD_0_ENDI |
 #endif
@@ -1943,10 +1957,12 @@ static struct arm_smmu_ctx_desc 
*arm_smmu_alloc_shared_cd(struct mm_struct *mm)
return old_cd;
}
 
+   /* HA and HD will be filtered out later if not supported by the SMMU */
tcr = FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, 64ULL - VA_BITS) |
  FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, ARM_LPAE_TCR_RGN_WBWA) |
  FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, ARM_LPAE_TCR_RGN_WBWA) |
  FIELD_PREP(CTXDESC_CD_0_TCR_SH0, ARM_LPAE_TCR_SH_IS) |
+ CTXDESC_CD_0_TCR_HA | CTXDESC_CD_0_TCR_HD |
  CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_AA64;
 
switch (PAGE_SIZE) {
@@ -4309,6 +4325,12 @@ static int arm_smmu_device_hw_probe(struct 
arm_smmu_device *smmu)
smmu->features |= ARM_SMMU_FEAT_E2H;
}
 
+   if (reg & (IDR0_HA | IDR0_HD)) {
+   smmu->features |= ARM_SMMU_FEAT_HA;
+   if (reg & IDR0_HD)
+   smmu->features |= ARM_SMMU_FEAT_HD;
+   }
+
/*
 * If the CPU is using VHE, but the SMMU doesn't support it, the SMMU
 * will create TLB entries for NH-EL1 world and will miss the
-- 
2.26.0



[PATCH v5 01/25] mm/mmu_notifiers: pass private data down to alloc_notifier()

2020-04-14 Thread Jean-Philippe Brucker
The new allocation scheme introduced by commit 2c7933f53f6b
("mm/mmu_notifiers: add a get/put scheme for the registration") provides
a convenient way for users to attach notifier data to an mm. However, it
would be even better to create this notifier data atomically.

Since the alloc_notifier() callback only takes an mm argument at the
moment, some users have to perform the allocation in two steps.
alloc_notifier() initially creates an incomplete structure, which is
then finalized using more context once mmu_notifier_get() returns. This
second step requires extra care to order memory accesses against live
invalidation.

The IOMMU SVA module, which attaches an mm to multiple devices,
exemplifies this situation. In essence it does:

mmu_notifier_get()
  alloc_notifier()
 A = kzalloc()
  /* MMU notifier is published */
A->ctx = ctx;   // (1)
device->A = A;
list_add_rcu(device, A->devices);   // (2)

The invalidate notifier, which may start running before A is fully
initialized, does the following:

io_mm_invalidate(A)
  list_for_each_entry_rcu(device, A->devices)
device->invalidate(A->ctx)

The invalidate() thread must observe the initialization (1) before (2),
which is easily solved by fully initializing object A in
alloc_notifier(), before publishing the MMU notifier.
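
A sketch of what the fix enables (illustrative names, not the IOMMU SVA
module itself): with the private data argument, object A can be fully
initialized before the notifier is published, so the invalidate path never
sees a half-constructed object.

#include <linux/mmu_notifier.h>
#include <linux/slab.h>

struct example_notifier {
        struct mmu_notifier     mn;
        void                    *ctx;
        struct list_head        devices;
};

static struct mmu_notifier *example_alloc_notifier(struct mm_struct *mm,
                                                   void *privdata)
{
        struct example_notifier *a;

        a = kzalloc(sizeof(*a), GFP_KERNEL);
        if (!a)
                return ERR_PTR(-ENOMEM);

        a->ctx = privdata;              /* (1) done before publication */
        INIT_LIST_HEAD(&a->devices);
        return &a->mn;                  /* published by mmu_notifier_get() */
}

The caller then passes its context down with
mmu_notifier_get(&example_ops, mm, ctx), and no post-publication
initialization is needed.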

Cc: Andrew Morton 
Cc: Arnd Bergmann 
Cc: Christoph Hellwig 
Cc: Dimitri Sivanich 
Cc: Greg Kroah-Hartman 
Cc: Jason Gunthorpe 
Signed-off-by: Jean-Philippe Brucker 
---
v4->v5: provide example in commit message, fix style.
---
 include/linux/mmu_notifier.h   | 11 +++
 drivers/misc/sgi-gru/grutlbpurge.c |  5 +++--
 mm/mmu_notifier.c  |  6 --
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 736f6918335ed..0536fe85e7457 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -207,7 +207,8 @@ struct mmu_notifier_ops {
 * callbacks are currently running. It is called from a SRCU callback
 * and cannot sleep.
 */
-   struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
+   struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm,
+  void *privdata);
void (*free_notifier)(struct mmu_notifier *subscription);
 };
 
@@ -271,14 +272,16 @@ static inline int mm_has_notifiers(struct mm_struct *mm)
 }
 
 struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops 
*ops,
-struct mm_struct *mm);
+struct mm_struct *mm,
+void *privdata);
 static inline struct mmu_notifier *
-mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
+mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm,
+void *privdata)
 {
struct mmu_notifier *ret;
 
down_write(>mmap_sem);
-   ret = mmu_notifier_get_locked(ops, mm);
+   ret = mmu_notifier_get_locked(ops, mm, privdata);
up_write(>mmap_sem);
return ret;
 }
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c 
b/drivers/misc/sgi-gru/grutlbpurge.c
index 10921cd2608df..336e1b1df072f 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -235,7 +235,8 @@ static void gru_invalidate_range_end(struct mmu_notifier 
*mn,
gms, range->start, range->end);
 }
 
-static struct mmu_notifier *gru_alloc_notifier(struct mm_struct *mm)
+static struct mmu_notifier *gru_alloc_notifier(struct mm_struct *mm,
+  void *privdata)
 {
struct gru_mm_struct *gms;
 
@@ -266,7 +267,7 @@ struct gru_mm_struct *gru_register_mmu_notifier(void)
 {
struct mmu_notifier *mn;
 
-   mn = mmu_notifier_get_locked(_mmuops, current->mm);
+   mn = mmu_notifier_get_locked(_mmuops, current->mm, NULL);
if (IS_ERR(mn))
return ERR_CAST(mn);
 
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 06852b896fa63..6b9bfb8ca94d2 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -743,6 +743,7 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct 
mmu_notifier_ops *ops)
  *   the mm & ops
  * @ops: The operations struct being subscribe with
  * @mm : The mm to attach notifiers too
+ * @privdata: Initialization data passed down to ops->alloc_notifier()
  *
  * This function either allocates a new mmu_notifier via
  * ops->alloc_notifier(), or returns an already existing notifier on the
@@ -756,7 +757,8 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct 
mmu_notifier_ops *ops)
  * and can be converted to an active mm pointer via mmget_not_zero().
  */
 struct mmu_notifier *mmu_notifier_get_locked(const struct 

[PATCH v5 16/25] iommu/arm-smmu-v3: Add SVA feature checking

2020-04-14 Thread Jean-Philippe Brucker
Aggregate all sanity-checks for sharing CPU page tables with the SMMU
under a single ARM_SMMU_FEAT_SVA bit. For PCIe SVA, users also need to
check FEAT_ATS and FEAT_PRI. For platform SVA, they will most likely have
to check FEAT_STALLS.
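
For illustration, how a consumer might combine these bits (sketch only;
ARM_SMMU_FEAT_ATS already exists in the driver, while the PRI and stall
bits and master->stall_enabled are added later in this series):

static bool example_master_can_sva(struct arm_smmu_master *master)
{
        struct arm_smmu_device *smmu = master->smmu;

        if (!(smmu->features & ARM_SMMU_FEAT_SVA))
                return false;

        /* PCIe SVA needs ATS (and PRI for I/O page faults) */
        if (dev_is_pci(master->dev))
                return master->ats_enabled;

        /* Platform SVA needs stall */
        return master->stall_enabled;
}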

Cc: Suzuki K Poulose 
Signed-off-by: Jean-Philippe Brucker 
---
v4->v5: bump feature bit
---
 drivers/iommu/arm-smmu-v3.c | 72 +
 1 file changed, 72 insertions(+)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index e7de8a7459fa4..d209d85402a83 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -657,6 +657,7 @@ struct arm_smmu_device {
 #define ARM_SMMU_FEAT_RANGE_INV(1 << 15)
 #define ARM_SMMU_FEAT_E2H  (1 << 16)
 #define ARM_SMMU_FEAT_BTM  (1 << 17)
+#define ARM_SMMU_FEAT_SVA  (1 << 18)
u32 features;
 
 #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
@@ -3930,6 +3931,74 @@ static int arm_smmu_device_reset(struct arm_smmu_device 
*smmu, bool bypass)
return 0;
 }
 
+static bool arm_smmu_supports_sva(struct arm_smmu_device *smmu)
+{
+   unsigned long reg, fld;
+   unsigned long oas;
+   unsigned long asid_bits;
+
+   u32 feat_mask = ARM_SMMU_FEAT_BTM | ARM_SMMU_FEAT_COHERENCY;
+
+   if ((smmu->features & feat_mask) != feat_mask)
+   return false;
+
+   if (!(smmu->pgsize_bitmap & PAGE_SIZE))
+   return false;
+
+   /*
+* Get the smallest PA size of all CPUs (sanitized by cpufeature). We're
+* not even pretending to support AArch32 here.
+*/
+   reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+   fld = cpuid_feature_extract_unsigned_field(reg, 
ID_AA64MMFR0_PARANGE_SHIFT);
+   switch (fld) {
+   case 0x0:
+   oas = 32;
+   break;
+   case 0x1:
+   oas = 36;
+   break;
+   case 0x2:
+   oas = 40;
+   break;
+   case 0x3:
+   oas = 42;
+   break;
+   case 0x4:
+   oas = 44;
+   break;
+   case 0x5:
+   oas = 48;
+   break;
+   case 0x6:
+   oas = 52;
+   break;
+   default:
+   return false;
+   }
+
+   /* abort if MMU outputs addresses greater than what we support. */
+   if (smmu->oas < oas)
+   return false;
+
+   /* We can support bigger ASIDs than the CPU, but not smaller */
+   fld = cpuid_feature_extract_unsigned_field(reg, 
ID_AA64MMFR0_ASID_SHIFT);
+   asid_bits = fld ? 16 : 8;
+   if (smmu->asid_bits < asid_bits)
+   return false;
+
+   /*
+* See max_pinned_asids in arch/arm64/mm/context.c. The following is
+* generally the maximum number of bindable processes.
+*/
+   if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0))
+   asid_bits--;
+   dev_dbg(smmu->dev, "%d shared contexts\n", (1 << asid_bits) -
+   num_possible_cpus() - 2);
+
+   return true;
+}
+
 static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
 {
u32 reg;
@@ -4142,6 +4211,9 @@ static int arm_smmu_device_hw_probe(struct 
arm_smmu_device *smmu)
 
smmu->ias = max(smmu->ias, smmu->oas);
 
+   if (arm_smmu_supports_sva(smmu))
+   smmu->features |= ARM_SMMU_FEAT_SVA;
+
dev_info(smmu->dev, "ias %lu-bit, oas %lu-bit (features 0x%08x)\n",
 smmu->ias, smmu->oas, smmu->features);
return 0;
-- 
2.26.0



[PATCH v5 06/25] iommu/sva: Register page fault handler

2020-04-14 Thread Jean-Philippe Brucker
When enabling SVA, register the fault handler. The device driver will
register an I/O page fault queue before or after calling iommu_sva_enable().
The fault queue must be flushed before any io_mm is freed, to make sure that
its PASID isn't referenced by any pending fault and can be reallocated.
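
As a rough sketch of the ordering described above (the device-side stop
function is hypothetical; only iommu_sva_unbind_generic() comes from this
series):

/*
 * Illustrative teardown ordering only; example_device_stop_pasid() is a
 * hypothetical driver hook, not defined by this series.
 */
static void example_teardown(struct device *dev, struct iommu_sva *handle)
{
	/* 1. Stop the device from issuing new transactions for the PASID. */
	example_device_stop_pasid(dev);

	/*
	 * 2. Unbind. The SVA core flushes the device's fault queue for this
	 *    PASID before the io_mm is freed, so the PASID can be safely
	 *    reallocated.
	 */
	iommu_sva_unbind_generic(handle);
}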

Signed-off-by: Jean-Philippe Brucker 
---
 drivers/iommu/Kconfig |  1 +
 drivers/iommu/iommu-sva.c | 11 +++
 2 files changed, 12 insertions(+)

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index bf620bf48da03..411a7ee2ab12d 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -106,6 +106,7 @@ config IOMMU_DMA
 config IOMMU_SVA
bool
select IOASID
+   select IOMMU_PAGE_FAULT
select IOMMU_API
select MMU_NOTIFIER
 
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index b177d6cbf4fff..00d5e7e895e80 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -420,6 +420,12 @@ void iommu_sva_unbind_generic(struct iommu_sva *handle)
if (WARN_ON(!param))
return;
 
+   /*
+* Caller stopped the device from issuing PASIDs, now make sure they are
+* out of the fault queue.
+*/
+   iopf_queue_flush_dev(handle->dev, bond->io_mm->pasid);
+
mutex_lock(>sva_lock);
mutex_lock(_sva_lock);
io_mm_detach(bond);
@@ -457,6 +463,10 @@ int iommu_sva_enable(struct device *dev, struct iommu_sva_param *sva_param)
goto err_unlock;
}
 
+   ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
+   if (ret)
+   goto err_unlock;
+
dev->iommu->sva_param = new_param;
mutex_unlock(>sva_lock);
return 0;
@@ -494,6 +504,7 @@ int iommu_sva_disable(struct device *dev)
goto out_unlock;
}
 
+   iommu_unregister_device_fault_handler(dev);
kfree(param->sva_param);
param->sva_param = NULL;
 out_unlock:
-- 
2.26.0


[PATCH v5 11/25] arm64: cpufeature: Export symbol read_sanitised_ftr_reg()

2020-04-14 Thread Jean-Philippe Brucker
The SMMUv3 driver would like to read the MMFR0 PARANGE field in order to
share CPU page tables with devices. Allow the driver to be built as a
module by exporting the read_sanitised_ftr_reg() cpufeature symbol.

Cc: Suzuki K Poulose 
Signed-off-by: Jean-Philippe Brucker 
---
 arch/arm64/kernel/cpufeature.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9fac745aa7bb2..5f6adbf4ae893 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -841,6 +841,7 @@ u64 read_sanitised_ftr_reg(u32 id)
BUG_ON(!regp);
return regp->sys_val;
 }
+EXPORT_SYMBOL_GPL(read_sanitised_ftr_reg);
 
 #define read_sysreg_case(r)\
case r: return read_sysreg_s(r)
-- 
2.26.0


[PATCH v5 09/25] iommu/io-pgtable-arm: Move some definitions to a header

2020-04-14 Thread Jean-Philippe Brucker
Extract some of the most generic TCR defines, so they can be reused by
the page table sharing code.

Signed-off-by: Jean-Philippe Brucker 
---
 drivers/iommu/io-pgtable-arm.h | 30 ++
 drivers/iommu/io-pgtable-arm.c | 27 ++-
 2 files changed, 32 insertions(+), 25 deletions(-)
 create mode 100644 drivers/iommu/io-pgtable-arm.h

diff --git a/drivers/iommu/io-pgtable-arm.h b/drivers/iommu/io-pgtable-arm.h
new file mode 100644
index 0..ba7cfdf7afa03
--- /dev/null
+++ b/drivers/iommu/io-pgtable-arm.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef IO_PGTABLE_ARM_H_
+#define IO_PGTABLE_ARM_H_
+
+#define ARM_LPAE_TCR_TG0_4K0
+#define ARM_LPAE_TCR_TG0_64K   1
+#define ARM_LPAE_TCR_TG0_16K   2
+
+#define ARM_LPAE_TCR_TG1_16K   1
+#define ARM_LPAE_TCR_TG1_4K2
+#define ARM_LPAE_TCR_TG1_64K   3
+
+#define ARM_LPAE_TCR_SH_NS 0
+#define ARM_LPAE_TCR_SH_OS 2
+#define ARM_LPAE_TCR_SH_IS 3
+
+#define ARM_LPAE_TCR_RGN_NC0
+#define ARM_LPAE_TCR_RGN_WBWA  1
+#define ARM_LPAE_TCR_RGN_WT2
+#define ARM_LPAE_TCR_RGN_WB3
+
+#define ARM_LPAE_TCR_PS_32_BIT 0x0ULL
+#define ARM_LPAE_TCR_PS_36_BIT 0x1ULL
+#define ARM_LPAE_TCR_PS_40_BIT 0x2ULL
+#define ARM_LPAE_TCR_PS_42_BIT 0x3ULL
+#define ARM_LPAE_TCR_PS_44_BIT 0x4ULL
+#define ARM_LPAE_TCR_PS_48_BIT 0x5ULL
+#define ARM_LPAE_TCR_PS_52_BIT 0x6ULL
+
+#endif /* IO_PGTABLE_ARM_H_ */
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 04fbd4bf0ff9f..f71a2eade04ab 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -20,6 +20,8 @@
 
 #include 
 
+#include "io-pgtable-arm.h"
+
 #define ARM_LPAE_MAX_ADDR_BITS 52
 #define ARM_LPAE_S2_MAX_CONCAT_PAGES   16
 #define ARM_LPAE_MAX_LEVELS4
@@ -100,23 +102,6 @@
 #define ARM_LPAE_PTE_MEMATTR_DEV   (((arm_lpae_iopte)0x1) << 2)
 
 /* Register bits */
-#define ARM_LPAE_TCR_TG0_4K0
-#define ARM_LPAE_TCR_TG0_64K   1
-#define ARM_LPAE_TCR_TG0_16K   2
-
-#define ARM_LPAE_TCR_TG1_16K   1
-#define ARM_LPAE_TCR_TG1_4K2
-#define ARM_LPAE_TCR_TG1_64K   3
-
-#define ARM_LPAE_TCR_SH_NS 0
-#define ARM_LPAE_TCR_SH_OS 2
-#define ARM_LPAE_TCR_SH_IS 3
-
-#define ARM_LPAE_TCR_RGN_NC0
-#define ARM_LPAE_TCR_RGN_WBWA  1
-#define ARM_LPAE_TCR_RGN_WT2
-#define ARM_LPAE_TCR_RGN_WB3
-
 #define ARM_LPAE_VTCR_SL0_MASK 0x3
 
 #define ARM_LPAE_TCR_T0SZ_SHIFT0
@@ -124,14 +109,6 @@
 #define ARM_LPAE_VTCR_PS_SHIFT 16
 #define ARM_LPAE_VTCR_PS_MASK  0x7
 
-#define ARM_LPAE_TCR_PS_32_BIT 0x0ULL
-#define ARM_LPAE_TCR_PS_36_BIT 0x1ULL
-#define ARM_LPAE_TCR_PS_40_BIT 0x2ULL
-#define ARM_LPAE_TCR_PS_42_BIT 0x3ULL
-#define ARM_LPAE_TCR_PS_44_BIT 0x4ULL
-#define ARM_LPAE_TCR_PS_48_BIT 0x5ULL
-#define ARM_LPAE_TCR_PS_52_BIT 0x6ULL
-
 #define ARM_LPAE_MAIR_ATTR_SHIFT(n)((n) << 3)
 #define ARM_LPAE_MAIR_ATTR_MASK0xff
 #define ARM_LPAE_MAIR_ATTR_DEVICE  0x04
-- 
2.26.0


[PATCH v5 08/25] arm64: mm: Pin down ASIDs for sharing mm with devices

2020-04-14 Thread Jean-Philippe Brucker
To enable address space sharing with the IOMMU, introduce mm_context_get()
and mm_context_put(), that pin down a context and ensure that it will keep
its ASID after a rollover. Export the symbols to let the modular SMMUv3
driver use them.

Pinning is necessary because a device constantly needs a valid ASID,
unlike tasks that only require one when running. Without pinning, we would
need to notify the IOMMU when we're about to use a new ASID for a task,
and it would get complicated when a new task is assigned a shared ASID.
Consider the following scenario with no ASID pinned:

1. Task t1 is running on CPUx with shared ASID (gen=1, asid=1)
2. Task t2 is scheduled on CPUx, gets ASID (1, 2)
3. Task tn is scheduled on CPUy, a rollover occurs, tn gets ASID (2, 1)
   We would now have to immediately generate a new ASID for t1, notify
   the IOMMU, and finally enable task tn. We are holding the lock during
   all that time, since we can't afford having another CPU trigger a
   rollover. The IOMMU issues invalidation commands that can take tens of
   milliseconds.

It gets needlessly complicated. All we wanted to do was schedule task tn,
that has no business with the IOMMU. By letting the IOMMU pin tasks when
needed, we avoid stalling the slow path, and let the pinning fail when
we're out of shareable ASIDs.

After a rollover, the allocator expects at least one ASID to be available
in addition to the reserved ones (one per CPU). So (NR_ASIDS - NR_CPUS -
1) is the maximum number of ASIDs that can be shared with the IOMMU.
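
As a rough, illustrative sketch of that budget (the real computation is
max_pinned_asids in arch/arm64/mm/context.c; the halving under KPTI matches
the asid_bits decrement in the SMMU feature-check patch earlier in this
series):

/* Illustrative only, not part of the patch. */
static unsigned long example_max_pinned_asids(unsigned int asid_bits,
					      unsigned int nr_cpus,
					      bool kpti)
{
	unsigned long nr_asids = 1UL << asid_bits;

	/* With KPTI, user/kernel ASID pairs halve the shareable space. */
	if (kpti)
		nr_asids /= 2;

	/* One reserved ASID per CPU, plus one kept free for rollover. */
	return nr_asids - nr_cpus - 1;
}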

Signed-off-by: Jean-Philippe Brucker 
---
v4->v5: extract helper macro
---
 arch/arm64/include/asm/mmu.h |  1 +
 arch/arm64/include/asm/mmu_context.h | 11 +++-
 arch/arm64/mm/context.c  | 95 +++-
 3 files changed, 104 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 68140fdd89d6b..bbdd291e31d59 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -19,6 +19,7 @@
 
 typedef struct {
atomic64_t  id;
+   unsigned long   pinned;
void*vdso;
unsigned long   flags;
 } mm_context_t;
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index ab46187c63001..69599a64945b0 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -177,7 +177,13 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp)
 #define destroy_context(mm)do { } while(0)
 void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
 
-#define init_new_context(tsk,mm)   ({ atomic64_set(&(mm)->context.id, 0); 0; })
+static inline int
+init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+   atomic64_set(>context.id, 0);
+   mm->context.pinned = 0;
+   return 0;
+}
 
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 static inline void update_saved_ttbr0(struct task_struct *tsk,
@@ -250,6 +256,9 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 void verify_cpu_asid_bits(void);
 void post_ttbr_update_workaround(void);
 
+unsigned long mm_context_get(struct mm_struct *mm);
+void mm_context_put(struct mm_struct *mm);
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* !__ASM_MMU_CONTEXT_H */
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index d702d60e64dab..d0ddd413f5645 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -27,6 +27,10 @@ static DEFINE_PER_CPU(atomic64_t, active_asids);
 static DEFINE_PER_CPU(u64, reserved_asids);
 static cpumask_t tlb_flush_pending;
 
+static unsigned long max_pinned_asids;
+static unsigned long nr_pinned_asids;
+static unsigned long *pinned_asid_map;
+
 #define ASID_MASK  (~GENMASK(asid_bits - 1, 0))
 #define ASID_FIRST_VERSION (1UL << asid_bits)
 
@@ -74,6 +78,9 @@ void verify_cpu_asid_bits(void)
 
 static void set_kpti_asid_bits(void)
 {
+   unsigned int k;
+   u8 *dst = (u8 *)asid_map;
+   u8 *src = (u8 *)pinned_asid_map;
unsigned int len = BITS_TO_LONGS(NUM_USER_ASIDS) * sizeof(unsigned long);
/*
 * In case of KPTI kernel/user ASIDs are allocated in
@@ -81,7 +88,8 @@ static void set_kpti_asid_bits(void)
 * is set, then the ASID will map only userspace. Thus
 * mark even as reserved for kernel.
 */
-   memset(asid_map, 0xaa, len);
+   for (k = 0; k < len; k++)
+   dst[k] = src[k] | 0xaa;
 }
 
 static void set_reserved_asid_bits(void)
@@ -89,7 +97,7 @@ static void set_reserved_asid_bits(void)
if (arm64_kernel_unmapped_at_el0())
set_kpti_asid_bits();
else
-   bitmap_clear(asid_map, 0, NUM_USER_ASIDS);
+   bitmap_copy(asid_map, pinned_asid_map, NUM_USER_ASIDS);
 }
 
 #define asid_gen_match(asid) \
@@ -165,6 +173,14 @@ static u64 new_context(struct mm_struct *mm)
if (check_update_reserved_asid(asid, newasid))

[PATCH v5 17/25] iommu/arm-smmu-v3: Implement mm operations

2020-04-14 Thread Jean-Philippe Brucker
Hook SVA operations to support sharing page tables with the SMMUv3:

* dev_enable/disable/has_feature for device drivers to modify the SVA
  state.
* sva_bind/unbind and sva_get_pasid to bind device and address spaces.
* The mm_attach/detach/clear/invalidate/free callbacks from iommu-sva

The clear() operation has to detach the page tables while DMA may still
be running (because the process died). To avoid any event 0x0a print
(C_BAD_CD) we disable translation and error reporting, without clearing
CD.V. PCIe Translation Requests and Page Requests are silently denied.
The detach() operation always happens whether or not clear() was
invoked, and properly disables the CD.

Signed-off-by: Jean-Philippe Brucker 
---
v4->v5: Add clear() operation
---
 drivers/iommu/Kconfig   |   1 +
 drivers/iommu/arm-smmu-v3.c | 214 +++-
 2 files changed, 211 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 411a7ee2ab12d..8118f090a51b3 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -435,6 +435,7 @@ config ARM_SMMU_V3
tristate "ARM Ltd. System MMU Version 3 (SMMUv3) Support"
depends on ARM64
select IOMMU_API
+   select IOMMU_SVA
select IOMMU_IO_PGTABLE_LPAE
select GENERIC_MSI_IRQ_DOMAIN
help
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index d209d85402a83..6640c2ac2a7c5 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -36,6 +36,7 @@
 #include 
 
 #include "io-pgtable-arm.h"
+#include "iommu-sva.h"
 
 /* MMIO registers */
 #define ARM_SMMU_IDR0  0x0
@@ -739,6 +740,13 @@ struct arm_smmu_option_prop {
 static DEFINE_XARRAY_ALLOC1(asid_xa);
 static DEFINE_SPINLOCK(contexts_lock);
 
+/*
+ * When a process dies, DMA is still running but we need to clear the pgd. If we
+ * simply cleared the valid bit from the context descriptor, we'd get event 0x0a
+ * (C_BAD_CD), which is not recoverable.
+ */
+static struct arm_smmu_ctx_desc invalid_cd = { 0 };
+
 static struct arm_smmu_option_prop arm_smmu_options[] = {
{ ARM_SMMU_OPT_SKIP_PREFETCH, "hisilicon,broken-prefetch-cmd" },
{ ARM_SMMU_OPT_PAGE0_REGS_ONLY, "cavium,cn9900-broken-page1-regspace"},
@@ -1649,7 +1657,9 @@ static int __arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain,
 * (2) Install a secondary CD, for SID+SSID traffic.
 * (3) Update ASID of a CD. Atomically write the first 64 bits of the
 * CD, then invalidate the old entry and mappings.
-* (4) Remove a secondary CD.
+* (4) Quiesce the context without clearing the valid bit. Disable
+* translation, and ignore any translation fault.
+* (5) Remove a secondary CD.
 */
u64 val;
bool cd_live;
@@ -1666,8 +1676,11 @@ static int __arm_smmu_write_ctx_desc(struct arm_smmu_domain *smmu_domain,
val = le64_to_cpu(cdptr[0]);
cd_live = !!(val & CTXDESC_CD_0_V);
 
-   if (!cd) { /* (4) */
+   if (!cd) { /* (5) */
val = 0;
+   } else if (cd == _cd) { /* (4) */
+   val &= ~(CTXDESC_CD_0_S | CTXDESC_CD_0_R);
+   val |= CTXDESC_CD_0_TCR_EPD0;
} else if (cd_live) { /* (3) */
val &= ~CTXDESC_CD_0_ASID;
val |= FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid);
@@ -1884,7 +1897,6 @@ static struct arm_smmu_ctx_desc *arm_smmu_share_asid(u16 asid)
return NULL;
 }
 
-__maybe_unused
 static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm)
 {
u16 asid;
@@ -1978,7 +1990,6 @@ static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm)
return ERR_PTR(ret);
 }
 
-__maybe_unused
 static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd)
 {
if (arm_smmu_free_asid(cd)) {
@@ -3008,6 +3019,16 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
master = dev_iommu_priv_get(dev);
smmu = master->smmu;
 
+   /*
+* Checking that SVA is disabled ensures that this device isn't bound to
+* any mm, and can be safely detached from its old domain. Bonds cannot
+* be removed concurrently since we're holding the group mutex.
+*/
+   if (iommu_sva_enabled(dev)) {
+   dev_err(dev, "cannot attach - SVA enabled\n");
+   return -EBUSY;
+   }
+
arm_smmu_detach_dev(master);
 
mutex_lock(_domain->init_mutex);
@@ -3107,6 +3128,99 @@ arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
return ops->iova_to_phys(ops, iova);
 }
 
+static void arm_smmu_mm_invalidate(struct device *dev, int pasid, void *entry,
+  unsigned long iova, size_t size)
+{
+   /* TODO: Invalidate ATC */
+}
+
+static int arm_smmu_mm_attach(struct device *dev, int pasid, void *entry,
+ bool 

[PATCH v5 07/25] arm64: mm: Add asid_gen_match() helper

2020-04-14 Thread Jean-Philippe Brucker
Add a macro to check if an ASID is from the current generation, since a
subsequent patch will introduce a third user for this test.

Signed-off-by: Jean-Philippe Brucker 
---
v4->v5: new
---
 arch/arm64/mm/context.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 9b26f9a88724f..d702d60e64dab 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -92,6 +92,9 @@ static void set_reserved_asid_bits(void)
bitmap_clear(asid_map, 0, NUM_USER_ASIDS);
 }
 
+#define asid_gen_match(asid) \
+   (!(((asid) ^ atomic64_read(_generation)) >> asid_bits))
+
 static void flush_context(void)
 {
int i;
@@ -220,8 +223,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 *   because atomic RmWs are totally ordered for a given location.
 */
old_active_asid = atomic64_read(_cpu(active_asids, cpu));
-   if (old_active_asid &&
-   !((asid ^ atomic64_read(_generation)) >> asid_bits) &&
+   if (old_active_asid && asid_gen_match(asid) &&
atomic64_cmpxchg_relaxed(_cpu(active_asids, cpu),
 old_active_asid, asid))
goto switch_mm_fastpath;
@@ -229,7 +231,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
raw_spin_lock_irqsave(_asid_lock, flags);
/* Check that our ASID belongs to the current generation. */
asid = atomic64_read(>context.id);
-   if ((asid ^ atomic64_read(_generation)) >> asid_bits) {
+   if (!asid_gen_match(asid)) {
asid = new_context(mm);
atomic64_set(>context.id, asid);
}
-- 
2.26.0


[PATCH v5 15/25] iommu/arm-smmu-v3: Enable broadcast TLB maintenance

2020-04-14 Thread Jean-Philippe Brucker
The SMMUv3 can handle invalidation targeted at TLB entries with shared
ASIDs. If the implementation supports broadcast TLB maintenance, enable it
and keep track of it in a feature bit. The SMMU will then be affected by
inner-shareable TLB invalidations from other agents.

A major side-effect of this change is that stage-2 translation contexts
are now affected by all invalidations by VMID. VMIDs are all shared and
the only ways to prevent over-invalidation, since the stage-2 page tables
are not shared between CPU and SMMU, are to either disable BTM or allocate
different VMIDs. This patch does not address the problem.

Signed-off-by: Jean-Philippe Brucker 
---
v4->v5: bump feature bit
---
 drivers/iommu/arm-smmu-v3.c | 19 +--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 21d458d817fc2..e7de8a7459fa4 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -56,6 +56,7 @@
 #define IDR0_ASID16(1 << 12)
 #define IDR0_ATS   (1 << 10)
 #define IDR0_HYP   (1 << 9)
+#define IDR0_BTM   (1 << 5)
 #define IDR0_COHACC(1 << 4)
 #define IDR0_TTF   GENMASK(3, 2)
 #define IDR0_TTF_AARCH64   2
@@ -655,6 +656,7 @@ struct arm_smmu_device {
 #define ARM_SMMU_FEAT_VAX  (1 << 14)
 #define ARM_SMMU_FEAT_RANGE_INV(1 << 15)
 #define ARM_SMMU_FEAT_E2H  (1 << 16)
+#define ARM_SMMU_FEAT_BTM  (1 << 17)
u32 features;
 
 #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0)
@@ -3814,11 +3816,14 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
writel_relaxed(reg, smmu->base + ARM_SMMU_CR1);
 
/* CR2 (random crap) */
-   reg = CR2_PTM | CR2_RECINVSID;
+   reg = CR2_RECINVSID;
 
if (smmu->features & ARM_SMMU_FEAT_E2H)
reg |= CR2_E2H;
 
+   if (!(smmu->features & ARM_SMMU_FEAT_BTM))
+   reg |= CR2_PTM;
+
writel_relaxed(reg, smmu->base + ARM_SMMU_CR2);
 
/* Stream table */
@@ -3929,6 +3934,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
 {
u32 reg;
bool coherent = smmu->features & ARM_SMMU_FEAT_COHERENCY;
+   bool vhe = cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN);
 
/* IDR0 */
reg = readl_relaxed(smmu->base + ARM_SMMU_IDR0);
@@ -3978,10 +3984,19 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
 
if (reg & IDR0_HYP) {
smmu->features |= ARM_SMMU_FEAT_HYP;
-   if (cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN))
+   if (vhe)
smmu->features |= ARM_SMMU_FEAT_E2H;
}
 
+   /*
+* If the CPU is using VHE, but the SMMU doesn't support it, the SMMU
+* will create TLB entries for NH-EL1 world and will miss the
+* broadcasted TLB invalidations that target EL2-E2H world. Don't enable
+* BTM in that case.
+*/
+   if (reg & IDR0_BTM && (!vhe || reg & IDR0_HYP))
+   smmu->features |= ARM_SMMU_FEAT_BTM;
+
/*
 * The coherency feature as set by FW is used in preference to the ID
 * register, but warn on mismatch.
-- 
2.26.0


Re: [PATCH 2/2] iommu/arm-smmu: Allow client devices to select direct mapping

2020-04-14 Thread Sai Prakash Ranjan

Hi Evan,

On 2020-04-14 04:42, Evan Green wrote:

On Wed, Jan 22, 2020 at 3:48 AM Sai Prakash Ranjan
 wrote:


From: Jordan Crouse 

Some client devices want to directly map the IOMMU themselves instead
of using the DMA domain. Allow those devices to opt in to direct
mapping by way of a list of compatible strings.

Signed-off-by: Jordan Crouse 
Co-developed-by: Sai Prakash Ranjan 
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm-smmu-qcom.c | 39 +++

 drivers/iommu/arm-smmu.c  |  3 +++
 drivers/iommu/arm-smmu.h  |  5 +
 3 files changed, 47 insertions(+)

diff --git a/drivers/iommu/arm-smmu-qcom.c b/drivers/iommu/arm-smmu-qcom.c

index 64a4ab270ab7..ff746acd1c81 100644
--- a/drivers/iommu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm-smmu-qcom.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2019, The Linux Foundation. All rights reserved.
  */

+#include 
 #include 

 #include "arm-smmu.h"
@@ -11,6 +12,43 @@ struct qcom_smmu {
struct arm_smmu_device smmu;
 };

+static const struct arm_smmu_client_match_data qcom_adreno = {
+   .direct_mapping = true,
+};
+
+static const struct arm_smmu_client_match_data qcom_mdss = {
+   .direct_mapping = true,


I don't actually see direct_mapping being used. Shouldn't this member
be checked somewhere?



Thanks for spotting this, my bad. It should be checked in 
qcom_smmu_request_domain().


diff --git a/drivers/iommu/arm-smmu-qcom.c b/drivers/iommu/arm-smmu-qcom.c

index ff746acd1c81..3ff62ca13ad5 100644
--- a/drivers/iommu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm-smmu-qcom.c
@@ -43,7 +43,7 @@ static int qcom_smmu_request_domain(struct device *dev)

const struct arm_smmu_client_match_data *client;

client = qcom_smmu_client_data(dev);
-   if (client)
+   if (client && client->direct_mapping)
iommu_request_dm_for_dev(dev);

return 0;

-Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

Re: [PATCH v2 1/3] iommu/uapi: Define uapi version and capabilities

2020-04-14 Thread Alex Williamson
On Mon, 13 Apr 2020 22:05:15 -0700
Jacob Pan  wrote:

> Hi Alex,
> Thanks a lot for the feedback, my comments inline.
> 
> On Mon, 13 Apr 2020 16:21:29 -0600
> Alex Williamson  wrote:
> 
> > On Mon, 13 Apr 2020 13:41:57 -0700
> > Jacob Pan  wrote:
> >   
> > > Hi All,
> > > 
> > > Just a gentle reminder, any feedback on the options I listed below?
> > > New ideas will be even better.
> > > 
> > > Christoph, does the explanation make sense to you? We do have the
> > > capability/flag based scheme for IOMMU API extension, the version is
> > > mainly used for size lookup. Compatibility checking is another use
> > > of the version, it makes checking easy when a vIOMMU is launched.
> > > 
> > > Thanks,
> > > 
> > > Jacob
> > > 
> > > On Thu, 2 Apr 2020 11:36:04 -0700
> > > Jacob Pan  wrote:
> > > 
> > > > On Wed, 1 Apr 2020 05:32:21 +
> > > > "Tian, Kevin"  wrote:
> > > >   
> > > > > > From: Jacob Pan 
> > > > > > Sent: Tuesday, March 31, 2020 11:55 PM
> > > > > > 
> > > > > > On Tue, 31 Mar 2020 06:06:38 +
> > > > > > "Tian, Kevin"  wrote:
> > > > > >   
> > > > > > > > From: Jacob Pan 
> > > > > > > > Sent: Tuesday, March 31, 2020 12:08 AM
> > > > > > > >
> > > > > > > > On Mon, 30 Mar 2020 05:40:40 +
> > > > > > > > "Tian, Kevin"  wrote:
> > > > > > > >  
> > > > > > > > > > From: Jacob Pan 
> > > > > > > > > > Sent: Saturday, March 28, 2020 7:54 AM
> > > > > > > > > >
> > > > > > > > > > On Fri, 27 Mar 2020 00:47:02 -0700
> > > > > > > > > > Christoph Hellwig  wrote:
> > > > > > > > > >  
> > > > > > > > > > > On Fri, Mar 27, 2020 at 02:49:55AM +, Tian,
> > > > > > > > > > > Kevin wrote:  
> > > > > > > > > > > > If those API calls are inter-dependent for
> > > > > > > > > > > > composing a feature (e.g. SVA), shouldn't we need
> > > > > > > > > > > > a way to check them together before exposing the
> > > > > > > > > > > > feature to the guest, e.g. through a
> > > > > > > > > > > > iommu_get_uapi_capabilities interface?  
> > > > > > > > > > >
> > > > > > > > > > > Yes, that makes sense.  The important bit is to
> > > > > > > > > > > have a capability flags and not version
> > > > > > > > > > > numbers.  
> > > > > > > > > >
> > > > > > > > > > The challenge is that there are two consumers in the
> > > > > > > > > > kernel for this. 1. VFIO only look for compatibility,
> > > > > > > > > > and size of each data struct such that it can
> > > > > > > > > > copy_from_user.
> > > > > > > > > >
> > > > > > > > > > 2. IOMMU driver, the "real consumer" of the content.
> > > > > > > > > >
> > > > > > > > > > For 2, I agree and we do plan to use the capability
> > > > > > > > > > flags to check content and maintain backward
> > > > > > > > > > compatibility etc.
> > > > > > > > > >
> > > > > > > > > > For VFIO, it is difficult to do size look up based on
> > > > > > > > > > capability flags.  
> > > > > > > > >
> > > > > > > > > Can you elaborate the difficulty in VFIO? if, as
> > > > > > > > > Christoph Hellwig pointed out, version number is
> > > > > > > > > already avoided everywhere, it is interesting to know
> > > > > > > > > whether this work becomes a real exception or just
> > > > > > > > > requires a different mindset.   
> > > > > > > > From VFIO p.o.v. the IOMMU UAPI data is opaque, it only
> > > > > > > > needs to do two things:
> > > > > > > > 1. is the UAPI compatible?
> > > > > > > > 2. what is the size to copy?
> > > > > > > >
> > > > > > > > If you look at the version number, this is really a
> > > > > > > > "version as size" lookup, as provided by the helper
> > > > > > > > function in this patch. An example can be the newly
> > > > > > > > introduced clone3 syscall.
> > > > > > > > https://lwn.net/Articles/792628/ In clone3, new version
> > > > > > > > must have new size. The slight difference here is that,
> > > > > > > > unlike clone3, we have multiple data structures instead
> > > > > > > > of a single struct clone_args {}. And each struct has
> > > > > > > > flags to enumerate its contents besides size.  
> > > > > > >
> > > > > > > Thanks for providing that link. However clone3 doesn't
> > > > > > > include a version field to do "version as size" lookup.
> > > > > > > Instead, as you said, it includes a size parameter which
> > > > > > > sounds like the option 3 (argsz) listed below.
> > > > > > >  
> > > > > > Right, there is no version in clone3. size = version. I view
> > > > > > this as a 1:1 lookup.
> > > > > >   
> > > > > > > >
> > > > > > > > Besides breaching data abstraction, if VFIO has to check
> > > > > > > > IOMMU flags to determine the sizes, it has many
> > > > > > > > combinations.
> > > > > > > >
> > > > > > > > We also separate the responsibilities into two parts
> > > > > > > > 1. compatibility - version, size by VFIO
> > > > > > > > 2. sanity check - capability flags - by IOMMU  
> > > > > > >
> > > > > > > I feel argsz+flags approach can perfectly meet 

Re: [PATCH v2 1/3] iommu/uapi: Define uapi version and capabilities

2020-04-14 Thread Jacob Pan
On Tue, 14 Apr 2020 01:11:07 -0700
Christoph Hellwig  wrote:

> On Mon, Apr 13, 2020 at 01:41:57PM -0700, Jacob Pan wrote:
> > Hi All,
> > 
> > Just a gentle reminder, any feedback on the options I listed below?
> > New ideas will be even better.
> > 
> > Christoph, does the explanation make sense to you? We do have the
> > capability/flag based scheme for IOMMU API extension, the version is
> > mainly used for size lookup. Compatibility checking is another use
> > of the version, it makes checking easy when a vIOMMU is launched.  
> 
> No.  If you truely need different versions use different ioctl
> identifiers.
OK. I will drop the global version and keep the current per API/IOCTL
struct.

>  If it really is just the size pass the size and not a
> version.
OK, I think we have a path forward. I will remove the size-version
lookup.

My concern was that, since we cannot trust the size provided by
userspace, we must sanity-check argsz based on knowledge of the
struct within the kernel. AFAIK, VFIO does this check by looking at
offsetofend(user_struct, last_element). But in this case, VFIO is not
the end consumer, and last_element can be a variable-size union,
so we'd better let the IOMMU driver deal with it.
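
For reference, a minimal sketch of the offsetofend()-style check being
discussed; the struct and field names below are made up for illustration
and are not from any posted patch:

struct example_uapi {
	__u32	argsz;
	__u32	flags;
	__u64	addr;		/* last fixed field before a variable union */
};

static int example_copy_min(struct example_uapi *dst, const void __user *uptr)
{
	unsigned long minsz = offsetofend(struct example_uapi, addr);

	if (copy_from_user(dst, uptr, minsz))
		return -EFAULT;

	/* Reject callers claiming a smaller struct than the kernel needs. */
	if (dst->argsz < minsz)
		return -EINVAL;

	return 0;
}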

Re: [RFC PATCH 11/34] iommu: Split off default domain allocation from group assignment

2020-04-14 Thread j...@8bytes.org
Hi Jonathan,

On Mon, Apr 13, 2020 at 10:10:50PM +, Derrick, Jonathan wrote:
> I had to add the following for initial VMD support. The new PCIe domain
> added on VMD endpoint probe didn't have the dev_iommu member set on the
> VMD subdevices, which I'm guessing is due to probe_iommu_group already
> having been run on the VMD endpoint's group prior to those subdevices
> being added.
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 8a5e1ac328dd..ac1e4fb9bf48 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -1577,6 +1577,9 @@ static int iommu_bus_notifier(struct notifier_block *nb,
> if (action == BUS_NOTIFY_ADD_DEVICE) {
> int ret;
>  
> +   if (!dev_iommu_get(dev))
> +   return -ENOMEM;
> +
> ret = iommu_probe_device(dev);
> return (ret) ? NOTIFY_DONE : NOTIFY_OK;
> } else if (action == BUS_NOTIFY_REMOVED_DEVICE) {

Right, thanks for catching this. The hotplug path does not allocate the
dev->iommu structure yet. I'll have to figure out if the above patch
adds it at the right place, but I'll fix it in the next version.

Thanks again,

Joerg

Re: [PATCH v1 2/2] vfio/pci: Emulate PASID/PRI capability for VFs

2020-04-14 Thread Alex Williamson
On Tue, 14 Apr 2020 03:42:42 +
"Tian, Kevin"  wrote:

> > From: Alex Williamson 
> > Sent: Tuesday, April 14, 2020 11:29 AM
> > 
> > On Tue, 14 Apr 2020 02:40:58 +
> > "Tian, Kevin"  wrote:
> >   
> > > > From: Alex Williamson 
> > > > Sent: Tuesday, April 14, 2020 3:21 AM
> > > >
> > > > On Mon, 13 Apr 2020 08:05:33 +
> > > > "Tian, Kevin"  wrote:
> > > >  
> > > > > > From: Tian, Kevin
> > > > > > Sent: Monday, April 13, 2020 3:55 PM
> > > > > >  
> > > > > > > From: Raj, Ashok 
> > > > > > > Sent: Monday, April 13, 2020 11:11 AM
> > > > > > >
> > > > > > > On Wed, Apr 08, 2020 at 10:19:40AM -0600, Alex Williamson wrote:  
> > > > > > > > On Tue, 7 Apr 2020 21:00:21 -0700
> > > > > > > > "Raj, Ashok"  wrote:
> > > > > > > >  
> > > > > > > > > Hi Alex
> > > > > > > > >
> > > > > > > > > + Bjorn  
> > > > > > > >
> > > > > > > >  + Don
> > > > > > > >  
> > > > > > > > > FWIW I can't understand why PCI SIG went different ways with  
> > ATS,  
> > > > > > > > > where its enumerated on PF and VF. But for PASID and PRI its  
> > only  
> > > > > > > > > in PF.
> > > > > > > > >
> > > > > > > > > I'm checking with our internal SIG reps to followup on that.
> > > > > > > > >
> > > > > > > > > On Tue, Apr 07, 2020 at 09:58:01AM -0600, Alex Williamson  
> > wrote:  
> > > > > > > > > > > Is there vendor guarantee that hidden registers will 
> > > > > > > > > > > locate at  
> > the  
> > > > > > > > > > > same offset between PF and VF config space?  
> > > > > > > > > >
> > > > > > > > > > I'm not sure if the spec really precludes hidden registers, 
> > > > > > > > > > but  
> > the  
> > > > > > > > > > fact that these registers are explicitly outside of the 
> > > > > > > > > > capability
> > > > > > > > > > chain implies they're only intended for device specific 
> > > > > > > > > > use, so  
> > I'd  
> > > > say  
> > > > > > > > > > there are no guarantees about anything related to these  
> > registers.  
> > > > > > > > >
> > > > > > > > > As you had suggested in the other thread, we could consider
> > > > > > > > > using the same offset as in PF, but even that's a better guess
> > > > > > > > > still not reliable.
> > > > > > > > >
> > > > > > > > > The other option is to maybe extend driver ops in the PF to  
> > expose  
> > > > > > > > > where the offsets should be. Sort of adding the quirk in the
> > > > > > > > > implementation.
> > > > > > > > >
> > > > > > > > > I'm not sure how prevalent are PASID and PRI in VF devices. 
> > > > > > > > > If  
> > SIG is  
> > > > > > > resisting  
> > > > > > > > > making VF's first class citizen, we might ask them to add 
> > > > > > > > > some  
> > > > verbiage  
> > > > > > > > > to suggest leave the same offsets as PF open to help 
> > > > > > > > > emulation  
> > > > software.  
> > > > > > > >
> > > > > > > > Even if we know where to expose these capabilities on the VF, 
> > > > > > > > it's  
> > not  
> > > > > > > > clear to me how we can actually virtualize the capability 
> > > > > > > > itself.  If
> > > > > > > > the spec defines, for example, an enable bit as r/w then 
> > > > > > > > software  
> > that  
> > > > > > > > interacts with that register expects the bit is settable.  
> > > > > > > > There's no
> > > > > > > > protocol for "try to set the bit and re-read it to see if the 
> > > > > > > > hardware
> > > > > > > > accepted it".  Therefore a capability with a fixed enable bit
> > > > > > > > representing the state of the PF, not settable by the VF, is
> > > > > > > > disingenuous to the spec.  
> > > > > > >
> > > > > > > I think we are all in violent agreement. A lot of times the pci 
> > > > > > > spec  
> > gets  
> > > > > > > defined several years ahead of real products and no one  
> > remembers  
> > > > > > > the justification on why they restricted things the way they did.
> > > > > > >
> > > > > > > Maybe someone early product wasn't quite exposing these features  
> > to  
> > > > the  
> > > > > > > VF
> > > > > > > and hence the spec is bug compatible :-)
> > > > > > >  
> > > > > > > >
> > > > > > > > If what we're trying to do is expose that PASID and PRI are 
> > > > > > > > enabled  
> > on  
> > > > > > > > the PF to a VF driver, maybe duplicating the PF capabilities on 
> > > > > > > > the  
> > VF  
> > > > > > > > without the ability to control it is not the right approach.  
> > > > > > > > Maybe  
> > we  
> > > > > > >
> > > > > > > As long as the capability enable is only provided when the PF has 
> > > > > > >  
> > > > enabled  
> > > > > > > the feature. Then it seems the hardware seems to do the right 
> > > > > > > thing.
> > > > > > >
> > > > > > > Assume we expose PASID/PRI only when PF has enabled it. It will 
> > > > > > > be  
> > the  
> > > > > > > case since the PF driver needs to exist, and IOMMU would have set 
> > > > > > >  
> > the  
> > > > > > > PASID/PRI/ATS on PF.
> > > > > > >
> > > > > > > If the emulation is purely spoofing the capability. Once vIOMMU  
> > driver  
> > > 

Re: [PATCH 21/29] mm: remove the pgprot argument to __vmalloc

2020-04-14 Thread Wei Liu
On Tue, Apr 14, 2020 at 03:13:40PM +0200, Christoph Hellwig wrote:
> The pgprot argument to __vmalloc is always PROT_KERNEL now, so remove
> it.
> 
> Signed-off-by: Christoph Hellwig 
> Reviewed-by: Michael Kelley  [hyperv]
> Acked-by: Gao Xiang  [erofs]
> Acked-by: Peter Zijlstra (Intel) 
> ---
>  arch/x86/hyperv/hv_init.c  |  3 +--
[...]
> 
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 5a4b363ba67b..a3d689dfc745 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -95,8 +95,7 @@ static int hv_cpu_init(unsigned int cpu)
>* not be stopped in the case of CPU offlining and the VM will hang.
>*/
>   if (!*hvp) {
> - *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO,
> -  PAGE_KERNEL);
> + *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
>   }

Acked-by: Wei Liu 

[PATCH v11 11/13] iommu/smmuv3: Enforce incompatibility between nested mode and HW MSI regions

2020-04-14 Thread Eric Auger
Nested mode currently is not compatible with HW MSI reserved regions.
Indeed MSI transactions targeting these MSI doorbells bypass the SMMU.

Let's check nested mode is not attempted in such configuration.

Signed-off-by: Eric Auger 
---
 drivers/iommu/arm-smmu-v3.c | 23 +--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index f157d1de614b..f4c793649152 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -2927,6 +2927,23 @@ static bool arm_smmu_share_msi_domain(struct iommu_domain *domain,
return share;
 }
 
+static bool arm_smmu_has_hw_msi_resv_region(struct device *dev)
+{
+   struct iommu_resv_region *region;
+   bool has_msi_resv_region = false;
+   LIST_HEAD(resv_regions);
+
+   iommu_get_resv_regions(dev, _regions);
+   list_for_each_entry(region, _regions, list) {
+   if (region->type == IOMMU_RESV_MSI) {
+   has_msi_resv_region = true;
+   break;
+   }
+   }
+   iommu_put_resv_regions(dev, _regions);
+   return has_msi_resv_region;
+}
+
 static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 {
int ret = 0;
@@ -2971,10 +2988,12 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
/*
 * In nested mode we must check all devices belonging to the
 * domain share the same physical MSI doorbell. Otherwise nested
-* stage MSI binding is not supported.
+* stage MSI binding is not supported. Also nested mode is not
+* compatible with MSI HW reserved regions.
 */
if (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED &&
-   !arm_smmu_share_msi_domain(domain, dev)) {
+   (!arm_smmu_share_msi_domain(domain, dev) ||
+arm_smmu_has_hw_msi_resv_region(dev))) {
ret = -EINVAL;
goto out_unlock;
}
-- 
2.20.1


[PATCH v11 13/13] iommu/smmuv3: Report non recoverable faults

2020-04-14 Thread Eric Auger
When a stage 1 related fault event is read from the event queue,
let's propagate it to potential external fault listeners, ie. users
who registered a fault handler.

Signed-off-by: Eric Auger 

---
v8 -> v9:
- adapt to the removal of IOMMU_FAULT_UNRECOV_PERM_VALID:
  only look at IOMMU_FAULT_UNRECOV_ADDR_VALID which comes with
  perm
- do not advertise IOMMU_FAULT_UNRECOV_PASID_VALID faults for
  translation faults
- trace errors if !master
- test nested before calling iommu_report_device_fault
- call the fault handler unconditionnally in non nested mode

v4 -> v5:
- s/IOMMU_FAULT_PERM_INST/IOMMU_FAULT_PERM_EXEC
---
 drivers/iommu/arm-smmu-v3.c | 182 +---
 1 file changed, 171 insertions(+), 11 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 253f96e97c11..ebf0cafe9fd5 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -171,6 +171,26 @@
 #define ARM_SMMU_PRIQ_IRQ_CFG1 0xd8
 #define ARM_SMMU_PRIQ_IRQ_CFG2 0xdc
 
+/* Events */
+#define ARM_SMMU_EVT_F_UUT 0x01
+#define ARM_SMMU_EVT_C_BAD_STREAMID0x02
+#define ARM_SMMU_EVT_F_STE_FETCH   0x03
+#define ARM_SMMU_EVT_C_BAD_STE 0x04
+#define ARM_SMMU_EVT_F_BAD_ATS_TREQ0x05
+#define ARM_SMMU_EVT_F_STREAM_DISABLED 0x06
+#define ARM_SMMU_EVT_F_TRANSL_FORBIDDEN0x07
+#define ARM_SMMU_EVT_C_BAD_SUBSTREAMID 0x08
+#define ARM_SMMU_EVT_F_CD_FETCH0x09
+#define ARM_SMMU_EVT_C_BAD_CD  0x0a
+#define ARM_SMMU_EVT_F_WALK_EABT   0x0b
+#define ARM_SMMU_EVT_F_TRANSLATION 0x10
+#define ARM_SMMU_EVT_F_ADDR_SIZE   0x11
+#define ARM_SMMU_EVT_F_ACCESS  0x12
+#define ARM_SMMU_EVT_F_PERMISSION  0x13
+#define ARM_SMMU_EVT_F_TLB_CONFLICT0x20
+#define ARM_SMMU_EVT_F_CFG_CONFLICT0x21
+#define ARM_SMMU_EVT_E_PAGE_REQUEST0x24
+
 /* Common MSI config fields */
 #define MSI_CFG0_ADDR_MASK GENMASK_ULL(51, 2)
 #define MSI_CFG2_SHGENMASK(5, 4)
@@ -387,6 +407,15 @@
 #define EVTQ_MAX_SZ_SHIFT  (Q_MAX_SZ_SHIFT - EVTQ_ENT_SZ_SHIFT)
 
 #define EVTQ_0_ID  GENMASK_ULL(7, 0)
+#define EVTQ_0_SSV GENMASK_ULL(11, 11)
+#define EVTQ_0_SUBSTREAMID GENMASK_ULL(31, 12)
+#define EVTQ_0_STREAMIDGENMASK_ULL(63, 32)
+#define EVTQ_1_PNU GENMASK_ULL(33, 33)
+#define EVTQ_1_IND GENMASK_ULL(34, 34)
+#define EVTQ_1_RNW GENMASK_ULL(35, 35)
+#define EVTQ_1_S2  GENMASK_ULL(39, 39)
+#define EVTQ_1_CLASS   GENMASK_ULL(40, 41)
+#define EVTQ_3_FETCH_ADDR  GENMASK_ULL(51, 3)
 
 /* PRI queue */
 #define PRIQ_ENT_SZ_SHIFT  4
@@ -730,6 +759,57 @@ struct arm_smmu_domain {
spinlock_t  devices_lock;
 };
 
+/* fault propagation */
+struct arm_smmu_fault_propagation_data {
+   enum iommu_fault_reason reason;
+   bool s1_check;
+   u32 fields; /* IOMMU_FAULT_UNRECOV_*_VALID bits */
+};
+
+/*
+ * Describes how SMMU faults translate into generic IOMMU faults
+ * and if they need to be reported externally
+ */
+static const struct arm_smmu_fault_propagation_data fault_propagation[] = {
+[ARM_SMMU_EVT_F_UUT]   = { },
+[ARM_SMMU_EVT_C_BAD_STREAMID]  = { },
+[ARM_SMMU_EVT_F_STE_FETCH] = { },
+[ARM_SMMU_EVT_C_BAD_STE]   = { },
+[ARM_SMMU_EVT_F_BAD_ATS_TREQ]  = { },
+[ARM_SMMU_EVT_F_STREAM_DISABLED]   = { },
+[ARM_SMMU_EVT_F_TRANSL_FORBIDDEN]  = { },
+[ARM_SMMU_EVT_C_BAD_SUBSTREAMID]   = {IOMMU_FAULT_REASON_PASID_INVALID,
+  false,
+  IOMMU_FAULT_UNRECOV_PASID_VALID
+ },
+[ARM_SMMU_EVT_F_CD_FETCH]  = {IOMMU_FAULT_REASON_PASID_FETCH,
+  false,
+  IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID
+ },
+[ARM_SMMU_EVT_C_BAD_CD]= {IOMMU_FAULT_REASON_BAD_PASID_ENTRY,
+  false,
+ },
+[ARM_SMMU_EVT_F_WALK_EABT] = {IOMMU_FAULT_REASON_WALK_EABT, true,
+  IOMMU_FAULT_UNRECOV_ADDR_VALID |
+  IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID
+ },
+[ARM_SMMU_EVT_F_TRANSLATION]   = {IOMMU_FAULT_REASON_PTE_FETCH, true,
+  IOMMU_FAULT_UNRECOV_ADDR_VALID
+ },
+[ARM_SMMU_EVT_F_ADDR_SIZE] = {IOMMU_FAULT_REASON_OOR_ADDRESS, true,
+  IOMMU_FAULT_UNRECOV_ADDR_VALID
+ },

[PATCH v11 10/13] iommu/smmuv3: Nested mode single MSI doorbell per domain enforcement

2020-04-14 Thread Eric Auger
In nested mode we enforce the rule that all devices belonging
to the same iommu_domain share the same msi_domain.

Indeed if there were several physical MSI doorbells being used
within a single iommu_domain, it becomes really difficult to
resolve the nested stage mapping translating into the correct
physical doorbell. So let's forbid this situation.

Signed-off-by: Eric Auger 
---
 drivers/iommu/arm-smmu-v3.c | 41 +
 1 file changed, 41 insertions(+)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 38854c3e4083..f157d1de614b 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -2896,6 +2896,37 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master)
arm_smmu_install_ste_for_dev(master);
 }
 
+static bool arm_smmu_share_msi_domain(struct iommu_domain *domain,
+ struct device *dev)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct irq_domain *irqd = dev_get_msi_domain(dev);
+   struct arm_smmu_master *master;
+   unsigned long flags;
+   bool share = false;
+
+   if (!irqd)
+   return true;
+
+   spin_lock_irqsave(_domain->devices_lock, flags);
+   list_for_each_entry(master, _domain->devices, domain_head) {
+   struct irq_domain *d = dev_get_msi_domain(master->dev);
+
+   if (!d)
+   continue;
+   if (irqd != d) {
+   dev_info(dev, "Nested mode forbids to attach devices "
+"using different physical MSI doorbells "
+"to the same iommu_domain");
+   goto unlock;
+   }
+   }
+   share = true;
+unlock:
+   spin_unlock_irqrestore(_domain->devices_lock, flags);
+   return share;
+}
+
 static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 {
int ret = 0;
@@ -2937,6 +2968,16 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
ret = -EINVAL;
goto out_unlock;
}
+   /*
+* In nested mode we must check all devices belonging to the
+* domain share the same physical MSI doorbell. Otherwise nested
+* stage MSI binding is not supported.
+*/
+   if (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED &&
+   !arm_smmu_share_msi_domain(domain, dev)) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
 
master->domain = smmu_domain;
 
-- 
2.20.1


[PATCH v11 12/13] iommu/smmuv3: Implement bind/unbind_guest_msi

2020-04-14 Thread Eric Auger
The bind/unbind_guest_msi() callbacks check the domain
is NESTED and redirect to the dma-iommu implementation.

Signed-off-by: Eric Auger 

---

v6 -> v7:
- remove device handle argument
---
 drivers/iommu/arm-smmu-v3.c | 43 +
 1 file changed, 43 insertions(+)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index f4c793649152..253f96e97c11 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -3377,6 +3377,47 @@ static void arm_smmu_get_resv_regions(struct device *dev,
iommu_dma_get_resv_regions(dev, head);
 }
 
+static int
+arm_smmu_bind_guest_msi(struct iommu_domain *domain,
+   dma_addr_t giova, phys_addr_t gpa, size_t size)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_device *smmu;
+   int ret = -EINVAL;
+
+   mutex_lock(_domain->init_mutex);
+   smmu = smmu_domain->smmu;
+   if (!smmu)
+   goto out;
+
+   if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   goto out;
+
+   ret = iommu_dma_bind_guest_msi(domain, giova, gpa, size);
+out:
+   mutex_unlock(_domain->init_mutex);
+   return ret;
+}
+
+static void
+arm_smmu_unbind_guest_msi(struct iommu_domain *domain, dma_addr_t giova)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_device *smmu;
+
+   mutex_lock(_domain->init_mutex);
+   smmu = smmu_domain->smmu;
+   if (!smmu)
+   goto unlock;
+
+   if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   goto unlock;
+
+   iommu_dma_unbind_guest_msi(domain, giova);
+unlock:
+   mutex_unlock(_domain->init_mutex);
+}
+
 static int arm_smmu_attach_pasid_table(struct iommu_domain *domain,
   struct iommu_pasid_table_config *cfg)
 {
@@ -3546,6 +3587,8 @@ static struct iommu_ops arm_smmu_ops = {
.attach_pasid_table = arm_smmu_attach_pasid_table,
.detach_pasid_table = arm_smmu_detach_pasid_table,
.cache_invalidate   = arm_smmu_cache_invalidate,
+   .bind_guest_msi = arm_smmu_bind_guest_msi,
+   .unbind_guest_msi   = arm_smmu_unbind_guest_msi,
.pgsize_bitmap  = -1UL, /* Restricted during device attach */
 };
 
-- 
2.20.1


[PATCH v11 03/13] iommu/arm-smmu-v3: Maintain a SID->device structure

2020-04-14 Thread Eric Auger
From: Jean-Philippe Brucker 

When handling faults from the event or PRI queue, we need to find the
struct device associated with a SID. Add an rb-tree to keep track of SIDs.
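
As an illustration of the intended use (hypothetical handler; only
arm_smmu_find_master() below comes from this patch, and the STREAMID field
extraction comes from a later patch in this series):

/* Illustrative sketch, not part of this patch. */
static void example_handle_evt(struct arm_smmu_device *smmu, u64 *evt)
{
	u32 sid = FIELD_GET(EVTQ_0_STREAMID, evt[0]);
	struct arm_smmu_master *master;

	master = arm_smmu_find_master(smmu, sid);
	if (!master) {
		dev_warn(smmu->dev, "no master registered for SID 0x%x\n", sid);
		return;
	}

	/* ... report the fault against master->dev ... */
}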

Signed-off-by: Eric Auger 
Signed-off-by: Jean-Philippe Brucker 
---
 drivers/iommu/arm-smmu-v3.c | 112 +++-
 1 file changed, 111 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 82508730feb7..ac7009348749 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -677,6 +677,16 @@ struct arm_smmu_device {
 
/* IOMMU core code handle */
struct iommu_device iommu;
+
+   struct rb_root  streams;
+   struct mutexstreams_mutex;
+
+};
+
+struct arm_smmu_stream {
+   u32 id;
+   struct arm_smmu_master  *master;
+   struct rb_node  node;
 };
 
 /* SMMU private data for each master */
@@ -687,6 +697,7 @@ struct arm_smmu_master {
struct list_headdomain_head;
u32 *sids;
unsigned intnum_sids;
+   struct arm_smmu_stream  *streams;
boolats_enabled;
unsigned intssid_bits;
 };
@@ -1967,6 +1978,32 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid)
return 0;
 }
 
+__maybe_unused
+static struct arm_smmu_master *
+arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid)
+{
+   struct rb_node *node;
+   struct arm_smmu_stream *stream;
+   struct arm_smmu_master *master = NULL;
+
+   mutex_lock(>streams_mutex);
+   node = smmu->streams.rb_node;
+   while (node) {
+   stream = rb_entry(node, struct arm_smmu_stream, node);
+   if (stream->id < sid) {
+   node = node->rb_right;
+   } else if (stream->id > sid) {
+   node = node->rb_left;
+   } else {
+   master = stream->master;
+   break;
+   }
+   }
+   mutex_unlock(>streams_mutex);
+
+   return master;
+}
+
 /* IRQ and event handlers */
 static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev)
 {
@@ -2912,6 +2949,69 @@ static bool arm_smmu_sid_in_range(struct arm_smmu_device *smmu, u32 sid)
return sid < limit;
 }
 
+static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
+ struct arm_smmu_master *master)
+{
+   int i;
+   int ret = 0;
+   struct arm_smmu_stream *new_stream, *cur_stream;
+   struct rb_node **new_node, *parent_node = NULL;
+
+   master->streams = kcalloc(master->num_sids,
+ sizeof(struct arm_smmu_stream), GFP_KERNEL);
+   if (!master->streams)
+   return -ENOMEM;
+
+   mutex_lock(>streams_mutex);
+   for (i = 0; i < master->num_sids && !ret; i++) {
+   new_stream = >streams[i];
+   new_stream->id = master->sids[i];
+   new_stream->master = master;
+
+   new_node = &(smmu->streams.rb_node);
+   while (*new_node) {
+   cur_stream = rb_entry(*new_node, struct arm_smmu_stream,
+ node);
+   parent_node = *new_node;
+   if (cur_stream->id > new_stream->id) {
+   new_node = &((*new_node)->rb_left);
+   } else if (cur_stream->id < new_stream->id) {
+   new_node = &((*new_node)->rb_right);
+   } else {
+   dev_warn(master->dev,
+"stream %u already in tree\n",
+cur_stream->id);
+   ret = -EINVAL;
+   break;
+   }
+   }
+
+   if (!ret) {
+   rb_link_node(_stream->node, parent_node, new_node);
+   rb_insert_color(_stream->node, >streams);
+   }
+   }
+   mutex_unlock(>streams_mutex);
+
+   return ret;
+}
+
+static void arm_smmu_remove_master(struct arm_smmu_device *smmu,
+  struct arm_smmu_master *master)
+{
+   int i;
+
+   if (!master->streams)
+   return;
+
+   mutex_lock(>streams_mutex);
+   for (i = 0; i < master->num_sids; i++)
+   rb_erase(>streams[i].node, >streams);
+   mutex_unlock(>streams_mutex);
+
+   kfree(master->streams);
+}
+
 static struct iommu_ops arm_smmu_ops;
 
 static int arm_smmu_add_device(struct device *dev)
@@ -2979,15 +3079,21 @@ static int arm_smmu_add_device(struct device *dev)
if (ret)
goto err_disable_pasid;
 
+ 

[PATCH v11 09/13] dma-iommu: Implement NESTED_MSI cookie

2020-04-14 Thread Eric Auger
Up to now, when the type was UNMANAGED, we used to
allocate IOVA pages within a reserved IOVA MSI range.

If both the host and the guest are exposed with SMMUs, each
would allocate an IOVA. The guest allocates an IOVA (gIOVA)
to map onto the guest MSI doorbell (gDB). The Host allocates
another IOVA (hIOVA) to map onto the physical doorbell (hDB).

So we end up with 2 unrelated mappings, at S1 and S2:
        S1             S2
gIOVA    ->     gDB
                hIOVA    ->    hDB

The PCI device would be programmed with hIOVA.
No stage 1 mapping would exist, causing the MSIs to fault.

iommu_dma_bind_guest_msi() allows the gIOVA/gDB mapping to be passed
to the host, so that the host can reuse gIOVA instead of
allocating a new hIOVA.

        S1             S2
gIOVA    ->     gDB    ->     hDB

This time, the PCI device can be programmed with the gIOVA MSI
doorbell which is correctly mapped through both stages.

Nested mode is not compatible with HW MSI regions as in that
case gDB and hDB should have a 1-1 mapping. This check will
be done when attaching each device to the IOMMU domain.
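
A rough sketch of how the host side might consume this, assuming the
signature used later in this series (the SMMU callback forwards to
iommu_dma_bind_guest_msi(domain, giova, gpa, size)); the VMM-facing wrapper
below is hypothetical:

/*
 * Illustrative only: when the VMM learns the guest's gIOVA -> gDB mapping,
 * the host binds it so that stage 2 maps gDB onto the physical doorbell.
 */
static int example_register_guest_msi(struct iommu_domain *domain,
				      dma_addr_t giova, phys_addr_t gdb,
				      size_t granule)
{
	/* gdb is the guest doorbell GPA; stage 2 resolves it to the hDB. */
	return iommu_dma_bind_guest_msi(domain, giova, gdb, granule);
}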

Signed-off-by: Eric Auger 

---

v10 -> v11:
- fix compilation if !CONFIG_IOMMU_DMA

v7 -> v8:
- correct iommu_dma_(un)bind_guest_msi when
  !CONFIG_IOMMU_DMA
- Mentioned nested mode is not compatible with HW MSI regions
  in commit message
- protect with msi_lock on unbind

v6 -> v7:
- removed device handle

v3 -> v4:
- change function names; add unregister
- protect with msi_lock

v2 -> v3:
- also store the device handle on S1 mapping registration.
  This garantees we associate the associated S2 mapping binds
  to the correct physical MSI controller.

v1 -> v2:
- unmap stage2 on put()
---
 drivers/iommu/dma-iommu.c | 142 +-
 include/linux/dma-iommu.h |  16 +
 2 files changed, 155 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index ba128d1cdaee..f25297bd09d6 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -28,12 +29,15 @@
 struct iommu_dma_msi_page {
struct list_headlist;
dma_addr_t  iova;
+   dma_addr_t  gpa;
phys_addr_t phys;
+   size_t  s1_granule;
 };
 
 enum iommu_dma_cookie_type {
IOMMU_DMA_IOVA_COOKIE,
IOMMU_DMA_MSI_COOKIE,
+   IOMMU_DMA_NESTED_MSI_COOKIE,
 };
 
 struct iommu_dma_cookie {
@@ -45,6 +49,7 @@ struct iommu_dma_cookie {
dma_addr_t  msi_iova;
};
struct list_headmsi_page_list;
+   spinlock_t  msi_lock;
 
/* Domain for flush queue callback; NULL if flush queue not in use */
struct iommu_domain *fq_domain;
@@ -63,6 +68,7 @@ static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type)
 
cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
if (cookie) {
+   spin_lock_init(>msi_lock);
INIT_LIST_HEAD(>msi_page_list);
cookie->type = type;
}
@@ -96,14 +102,17 @@ EXPORT_SYMBOL(iommu_get_dma_cookie);
  *
  * Users who manage their own IOVA allocation and do not want DMA API support,
  * but would still like to take advantage of automatic MSI remapping, can use
- * this to initialise their own domain appropriately. Users should reserve a
+ * this to initialise their own domain appropriately. Users may reserve a
  * contiguous IOVA region, starting at @base, large enough to accommodate the
  * number of PAGE_SIZE mappings necessary to cover every MSI doorbell address
- * used by the devices attached to @domain.
+ * used by the devices attached to @domain. The other way round is to provide
+ * usable iova pages through the iommu_dma_bind_doorbell API (nested stages
+ * use case)
  */
 int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
 {
struct iommu_dma_cookie *cookie;
+   int nesting, ret;
 
if (domain->type != IOMMU_DOMAIN_UNMANAGED)
return -EINVAL;
@@ -111,7 +120,12 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
if (domain->iova_cookie)
return -EEXIST;
 
-   cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
+   ret =  iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, );
+   if (!ret && nesting)
+   cookie = cookie_alloc(IOMMU_DMA_NESTED_MSI_COOKIE);
+   else
+   cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE);
+
if (!cookie)
return -ENOMEM;
 
@@ -132,6 +146,7 @@ void iommu_put_dma_cookie(struct iommu_domain *domain)
 {
struct iommu_dma_cookie *cookie = domain->iova_cookie;
struct iommu_dma_msi_page *msi, *tmp;
+   bool s2_unmap = false;
 
if (!cookie)
return;
@@ -139,7 +154,15 @@ void iommu_put_dma_cookie(struct 

[PATCH v11 05/13] iommu/smmuv3: Get prepared for nested stage support

2020-04-14 Thread Eric Auger
When nested stage translation is setup, both s1_cfg and
s2_cfg are allocated.

We introduce a new smmu_domain abort field that will be set
when the guest passes its stage 1 configuration.

arm_smmu_write_strtab_ent() is modified to write both stage
fields in the STE and deal with the abort field.

In nested mode, only stage 2 is "finalized" as the host does
not own/configure the stage 1 context descriptor; guest does.

Signed-off-by: Eric Auger 

---
v10 -> v11:
- Fix an issue reported by Shameer when switching from a setup with a
  vSMMU to one without a vSMMU. Although the spec does not seem to
  mention it, it seems necessary to reset the two high 64-bit words of
  the STE when switching from an S1+S2 config to S1 only. In particular
  dst[3] (S2TTB) needs to be reset; on some implementations, leaving the
  S2TTB set causes a C_BAD_STE error.

v7 -> v8:
- rebase on 8be39a1a04c1 iommu/arm-smmu-v3: Add a master->domain
  pointer
- restore live checks for not nested cases and add s1_live and
  s2_live to be more previse. Remove bypass local variable.
  In STE live case, move the ste to abort state and send a
  CFGI_STE before updating the rest of the fields.
- check s2ttb in case of live s2

v4 -> v5:
- reset ste.abort on detach

v3 -> v4:
- s1_cfg.nested_abort and nested_bypass removed.
- s/ste.nested/ste.abort
- arm_smmu_write_strtab_ent modifications with introduction
  of local abort, bypass and translate local variables
- comment updated
---
 drivers/iommu/arm-smmu-v3.c | 68 +++--
 1 file changed, 58 insertions(+), 10 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index da3739bb7323..dd3c12034e84 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -223,6 +223,7 @@
 #define STRTAB_STE_0_CFG_BYPASS4
 #define STRTAB_STE_0_CFG_S1_TRANS  5
 #define STRTAB_STE_0_CFG_S2_TRANS  6
+#define STRTAB_STE_0_CFG_NESTED7
 
 #define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4)
 #define STRTAB_STE_0_S1FMT_LINEAR  0
@@ -721,6 +722,7 @@ struct arm_smmu_domain {
enum arm_smmu_domain_stage  stage;
struct arm_smmu_s1_cfg  *s1_cfg;
struct arm_smmu_s2_cfg  *s2_cfg;
+   boolabort;
 
struct iommu_domain domain;
 
@@ -1807,8 +1809,10 @@ static void arm_smmu_write_strtab_ent(struct 
arm_smmu_master *master, u32 sid,
 * three cases at the moment:
 *
 * 1. Invalid (all zero) -> bypass/fault (init)
-* 2. Bypass/fault -> translation/bypass (attach)
-* 3. Translation/bypass -> bypass/fault (detach)
+* 2. Bypass/fault -> single stage translation/bypass (attach)
+* 3. Single or nested stage Translation/bypass -> bypass/fault (detach)
+* 4. S2 -> S1 + S2 (attach_pasid_table)
+* 5. S1 + S2 -> S2 (detach_pasid_table)
 *
 * Given that we can't update the STE atomically and the SMMU
 * doesn't read the thing in a defined order, that leaves us
@@ -1819,7 +1823,8 @@ static void arm_smmu_write_strtab_ent(struct 
arm_smmu_master *master, u32 sid,
 * 3. Update Config, sync
 */
u64 val = le64_to_cpu(dst[0]);
-   bool ste_live = false;
+   bool abort, translate, s1_live = false, s2_live = false, ste_live;
+   bool nested = false;
struct arm_smmu_device *smmu = NULL;
struct arm_smmu_s1_cfg *s1_cfg = NULL;
struct arm_smmu_s2_cfg *s2_cfg = NULL;
@@ -1839,6 +1844,7 @@ static void arm_smmu_write_strtab_ent(struct 
arm_smmu_master *master, u32 sid,
if (smmu_domain) {
s1_cfg = smmu_domain->s1_cfg;
s2_cfg = smmu_domain->s2_cfg;
+   nested = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
}
 
if (val & STRTAB_STE_0_V) {
@@ -1846,23 +1852,37 @@ static void arm_smmu_write_strtab_ent(struct 
arm_smmu_master *master, u32 sid,
case STRTAB_STE_0_CFG_BYPASS:
break;
case STRTAB_STE_0_CFG_S1_TRANS:
+   s1_live = true;
+   break;
case STRTAB_STE_0_CFG_S2_TRANS:
-   ste_live = true;
+   s2_live = true;
+   break;
+   case STRTAB_STE_0_CFG_NESTED:
+   s1_live = true;
+   s2_live = true;
break;
case STRTAB_STE_0_CFG_ABORT:
-   BUG_ON(!disable_bypass);
break;
default:
BUG(); /* STE corruption */
}
}
 
+   ste_live = s1_live || s2_live;
+
/* Nuke the existing STE_0 value, as we're going to rewrite it */
val = STRTAB_STE_0_V;
 
/* Bypass/fault */
-   if (!smmu_domain || !(s1_cfg || s2_cfg)) {
-   if (!smmu_domain && disable_bypass)
+
+   if 

[PATCH v11 08/13] iommu/smmuv3: Implement cache_invalidate

2020-04-14 Thread Eric Auger
Implement domain-selective and page-selective IOTLB invalidations.
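
For reference, an ASID-scoped invalidation request reaching this callback is
expected to look roughly like the snippet below (field names as in the uapi
this series relies on; the guest_asid value is hypothetical):

struct iommu_cache_invalidate_info inv_info = {
        .version     = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1,
        .cache       = IOMMU_CACHE_INV_TYPE_IOTLB,
        .granularity = IOMMU_INV_GRANU_PASID,
        .pasid_info  = {
                /* ARCHID set, PASID clear: invalidate by (guest) ASID */
                .flags  = IOMMU_INV_PASID_FLAGS_ARCHID,
                .archid = guest_asid,
        },
};
/* handed down by the IOMMU core / VFIO to the callback implemented below */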

Signed-off-by: Eric Auger 

---
v7 -> v8:
- ASID based invalidation using iommu_inv_pasid_info
- check ARCHID/PASID flags in addr based invalidation
- use __arm_smmu_tlb_inv_context and __arm_smmu_tlb_inv_range_nosync

v6 -> v7
- check the uapi version

v3 -> v4:
- adapt to changes in the uapi
- add support for leaf parameter
- do not use arm_smmu_tlb_inv_range_nosync or arm_smmu_tlb_inv_context
  anymore

v2 -> v3:
- replace __arm_smmu_tlb_sync by arm_smmu_cmdq_issue_sync

v1 -> v2:
- properly pass the asid
---
 drivers/iommu/arm-smmu-v3.c | 53 +
 1 file changed, 53 insertions(+)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 4ec2106be301..38854c3e4083 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -3413,6 +3413,58 @@ static void arm_smmu_detach_pasid_table(struct 
iommu_domain *domain)
mutex_unlock(&smmu_domain->init_mutex);
 }
 
+static int
+arm_smmu_cache_invalidate(struct iommu_domain *domain, struct device *dev,
+ struct iommu_cache_invalidate_info *inv_info)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_device *smmu = smmu_domain->smmu;
+
+   if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   return -EINVAL;
+
+   if (!smmu)
+   return -EINVAL;
+
+   if (inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
+   return -EINVAL;
+
+   if (inv_info->cache & IOMMU_CACHE_INV_TYPE_IOTLB) {
+   if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
+   struct iommu_inv_pasid_info *info =
+   &inv_info->pasid_info;
+
+   if (!(info->flags & IOMMU_INV_PASID_FLAGS_ARCHID) ||
+(info->flags & IOMMU_INV_PASID_FLAGS_PASID))
+   return -EINVAL;
+
+   __arm_smmu_tlb_inv_context(smmu_domain, info->archid);
+
+   } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
+   struct iommu_inv_addr_info *info = &inv_info->addr_info;
+   size_t size = info->nb_granules * info->granule_size;
+   bool leaf = info->flags & IOMMU_INV_ADDR_FLAGS_LEAF;
+
+   if (!(info->flags & IOMMU_INV_ADDR_FLAGS_ARCHID) ||
+(info->flags & IOMMU_INV_ADDR_FLAGS_PASID))
+   return -EINVAL;
+
+   __arm_smmu_tlb_inv_range(info->addr, size,
+info->granule_size, leaf,
+ smmu_domain, info->archid);
+
+   arm_smmu_cmdq_issue_sync(smmu);
+   } else {
+   return -EINVAL;
+   }
+   }
+   if (inv_info->cache & IOMMU_CACHE_INV_TYPE_PASID ||
+   inv_info->cache & IOMMU_CACHE_INV_TYPE_DEV_IOTLB) {
+   return -ENOENT;
+   }
+   return 0;
+}
+
 static struct iommu_ops arm_smmu_ops = {
.capable= arm_smmu_capable,
.domain_alloc   = arm_smmu_domain_alloc,
@@ -3433,6 +3485,7 @@ static struct iommu_ops arm_smmu_ops = {
.put_resv_regions   = generic_iommu_put_resv_regions,
.attach_pasid_table = arm_smmu_attach_pasid_table,
.detach_pasid_table = arm_smmu_detach_pasid_table,
+   .cache_invalidate   = arm_smmu_cache_invalidate,
.pgsize_bitmap  = -1UL, /* Restricted during device attach */
 };
 
-- 
2.20.1



[PATCH v11 06/13] iommu/smmuv3: Implement attach/detach_pasid_table

2020-04-14 Thread Eric Auger
On attach_pasid_table() we program the STE S1-related info set
by the guest into the actual physical STEs. At a minimum
we need to program the context descriptor GPA and compute
whether stage 1 is translated, bypassed or aborted.
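
The guest config to host state mapping implemented below can be summarized
as follows (a paraphrase of the switch statement, not additional driver code):

/*
 * IOMMU_PASID_CONFIG_ABORT:     s1_cfg freed/NULL, smmu_domain->abort = true
 * IOMMU_PASID_CONFIG_BYPASS:    s1_cfg freed/NULL, smmu_domain->abort = false
 * IOMMU_PASID_CONFIG_TRANSLATE: s1_cfg allocated,
 *                               s1_cfg->cdcfg.cdtab_dma = cfg->base_ptr,
 *                               smmu_domain->abort = false
 * In all cases the STEs of the attached masters are then re-installed.
 */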

Signed-off-by: Eric Auger 

---
v7 -> v8:
- remove smmu->features check, now done on domain finalize

v6 -> v7:
- check versions and comment the fact we don't need to take
  into account s1dss and s1fmt
v3 -> v4:
- adapt to changes in iommu_pasid_table_config
- different programming convention at s1_cfg/s2_cfg/ste.abort

v2 -> v3:
- callback now is named set_pasid_table and struct fields
  are laid out differently.

v1 -> v2:
- invalidate the STE before changing them
- hold init_mutex
- handle new fields
---
 drivers/iommu/arm-smmu-v3.c | 98 +
 1 file changed, 98 insertions(+)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index dd3c12034e84..21bcf2536320 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -3293,6 +3293,102 @@ static void arm_smmu_get_resv_regions(struct device 
*dev,
iommu_dma_get_resv_regions(dev, head);
 }
 
+static int arm_smmu_attach_pasid_table(struct iommu_domain *domain,
+  struct iommu_pasid_table_config *cfg)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_master *master;
+   struct arm_smmu_device *smmu;
+   unsigned long flags;
+   int ret = -EINVAL;
+
+   if (cfg->format != IOMMU_PASID_FORMAT_SMMUV3)
+   return -EINVAL;
+
+   if (cfg->version != PASID_TABLE_CFG_VERSION_1 ||
+   cfg->smmuv3.version != PASID_TABLE_SMMUV3_CFG_VERSION_1)
+   return -EINVAL;
+
+   mutex_lock(&smmu_domain->init_mutex);
+
+   smmu = smmu_domain->smmu;
+
+   if (!smmu)
+   goto out;
+
+   if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   goto out;
+
+   switch (cfg->config) {
+   case IOMMU_PASID_CONFIG_ABORT:
+   kfree(smmu_domain->s1_cfg);
+   smmu_domain->s1_cfg = NULL;
+   smmu_domain->abort = true;
+   break;
+   case IOMMU_PASID_CONFIG_BYPASS:
+   kfree(smmu_domain->s1_cfg);
+   smmu_domain->s1_cfg = NULL;
+   smmu_domain->abort = false;
+   break;
+   case IOMMU_PASID_CONFIG_TRANSLATE:
+   /* we do not support S1 <-> S1 transitions */
+   if (smmu_domain->s1_cfg)
+   goto out;
+
+   /*
+* we currently support a single CD so s1fmt and s1dss
+* fields are also ignored
+*/
+   if (cfg->pasid_bits)
+   goto out;
+
+   smmu_domain->s1_cfg = kzalloc(sizeof(*smmu_domain->s1_cfg),
+ GFP_KERNEL);
+   if (!smmu_domain->s1_cfg) {
+   ret = -ENOMEM;
+   goto out;
+   }
+
+   smmu_domain->s1_cfg->cdcfg.cdtab_dma = cfg->base_ptr;
+   smmu_domain->abort = false;
+   break;
+   default:
+   goto out;
+   }
+   spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+   list_for_each_entry(master, &smmu_domain->devices, domain_head)
+   arm_smmu_install_ste_for_dev(master);
+   spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+   ret = 0;
+out:
+   mutex_unlock(&smmu_domain->init_mutex);
+   return ret;
+}
+
+static void arm_smmu_detach_pasid_table(struct iommu_domain *domain)
+{
+   struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_master *master;
+   unsigned long flags;
+
+   mutex_lock(&smmu_domain->init_mutex);
+
+   if (smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   goto unlock;
+
+   kfree(smmu_domain->s1_cfg);
+   smmu_domain->s1_cfg = NULL;
+   smmu_domain->abort = true;
+
+   spin_lock_irqsave(&smmu_domain->devices_lock, flags);
+   list_for_each_entry(master, &smmu_domain->devices, domain_head)
+   arm_smmu_install_ste_for_dev(master);
+   spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+
+unlock:
+   mutex_unlock(&smmu_domain->init_mutex);
+}
+
 static struct iommu_ops arm_smmu_ops = {
.capable= arm_smmu_capable,
.domain_alloc   = arm_smmu_domain_alloc,
@@ -3311,6 +3407,8 @@ static struct iommu_ops arm_smmu_ops = {
.of_xlate   = arm_smmu_of_xlate,
.get_resv_regions   = arm_smmu_get_resv_regions,
.put_resv_regions   = generic_iommu_put_resv_regions,
+   .attach_pasid_table = arm_smmu_attach_pasid_table,
+   .detach_pasid_table = arm_smmu_detach_pasid_table,
.pgsize_bitmap  = -1UL, /* Restricted during device attach */
 };
 
-- 
2.20.1


[PATCH v11 07/13] iommu/smmuv3: Allow stage 1 invalidation with unmanaged ASIDs

2020-04-14 Thread Eric Auger
With nested stage support, we will soon need to invalidate
S1 contexts and ranges tagged with an unmanaged ASID, the
latter being owned by the guest. So let's introduce two helpers
that allow invalidation with externally managed ASIDs.
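
Usage sketch (how the cache_invalidate patch later in the series is expected
to call these helpers; passing -1 keeps the current internal behaviour):

/* Invalidate all S1 TLB entries tagged with a guest-managed ASID */
__arm_smmu_tlb_inv_context(smmu_domain, guest_asid);

/* Invalidate a range tagged with that ASID, then sync the command queue */
__arm_smmu_tlb_inv_range(iova, size, granule_size, leaf,
                         smmu_domain, guest_asid);
arm_smmu_cmdq_issue_sync(smmu_domain->smmu);

/* ext_asid == -1: behave as today, ASID/VMID are taken from the domain */
__arm_smmu_tlb_inv_context(smmu_domain, -1);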

Signed-off-by: Eric Auger 
---
 drivers/iommu/arm-smmu-v3.c | 36 ++--
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 21bcf2536320..4ec2106be301 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -2307,13 +2307,18 @@ static int arm_smmu_atc_inv_domain(struct 
arm_smmu_domain *smmu_domain,
 }
 
 /* IO_PGTABLE API */
-static void arm_smmu_tlb_inv_context(void *cookie)
+
+static void __arm_smmu_tlb_inv_context(struct arm_smmu_domain *smmu_domain,
+  int ext_asid)
 {
-   struct arm_smmu_domain *smmu_domain = cookie;
struct arm_smmu_device *smmu = smmu_domain->smmu;
struct arm_smmu_cmdq_ent cmd;
 
-   if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+   if (ext_asid >= 0) { /* guest stage 1 invalidation */
+   cmd.opcode  = CMDQ_OP_TLBI_NH_ASID;
+   cmd.tlbi.asid   = ext_asid;
+   cmd.tlbi.vmid   = smmu_domain->s2_cfg->vmid;
+   } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
cmd.opcode  = CMDQ_OP_TLBI_NH_ASID;
cmd.tlbi.asid   = smmu_domain->s1_cfg->cd.asid;
cmd.tlbi.vmid   = 0;
@@ -2334,9 +2339,17 @@ static void arm_smmu_tlb_inv_context(void *cookie)
arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
 }
 
-static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
+static void arm_smmu_tlb_inv_context(void *cookie)
+{
+   struct arm_smmu_domain *smmu_domain = cookie;
+
+   __arm_smmu_tlb_inv_context(smmu_domain, -1);
+}
+
+static void __arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
   size_t granule, bool leaf,
-  struct arm_smmu_domain *smmu_domain)
+  struct arm_smmu_domain *smmu_domain,
+  int ext_asid)
 {
struct arm_smmu_device *smmu = smmu_domain->smmu;
unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
@@ -2351,7 +2364,11 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, 
size_t size,
if (!size)
return;
 
-   if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
+   if (ext_asid >= 0) {  /* guest stage 1 invalidation */
+   cmd.opcode  = CMDQ_OP_TLBI_NH_VA;
+   cmd.tlbi.asid   = ext_asid;
+   cmd.tlbi.vmid   = smmu_domain->s2_cfg->vmid;
+   } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
cmd.opcode  = CMDQ_OP_TLBI_NH_VA;
cmd.tlbi.asid   = smmu_domain->s1_cfg->cd.asid;
} else {
@@ -2411,6 +2428,13 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, 
size_t size,
arm_smmu_atc_inv_domain(smmu_domain, 0, start, size);
 }
 
+static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
+  size_t granule, bool leaf,
+  struct arm_smmu_domain *smmu_domain)
+{
+   __arm_smmu_tlb_inv_range(iova, size, granule, leaf, smmu_domain, -1);
+}
+
 static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
 unsigned long iova, size_t granule,
 void *cookie)
-- 
2.20.1



[PATCH v11 02/13] iommu: Introduce bind/unbind_guest_msi

2020-04-14 Thread Eric Auger
On ARM, MSIs are translated by the SMMU. An IOVA is allocated
for each MSI doorbell. If both the host and the guest are exposed
to an SMMU, we end up with 2 different IOVAs, one allocated by each:
the guest allocates an IOVA (gIOVA) to map onto the guest MSI
doorbell (gDB), while the host allocates another IOVA (hIOVA) to map
onto the physical doorbell (hDB).

So we end up with 2 untied mappings:
 S1S2
gIOVA->gDB
  hIOVA->hDB

Currently the PCI device is programmed by the host with hIOVA
as its MSI doorbell, so this setup does not work.

This patch introduces an API to pass gIOVA/gDB to the host so
that gIOVA can be reused by the host instead of re-allocating
a new IOVA. So the goal is to create the following nested mapping:

 S1S2
gIOVA->gDB ->hDB

and program the PCI device with gIOVA MSI doorbell.

If several devices are attached to this nested domain (devices
belonging to the same group), they cannot be isolated on the guest
side either, so they should also end up in the same domain on the
guest side. We therefore enforce that all the devices attached to
the host iommu domain use the same physical doorbell, and similarly
that a single virtual doorbell mapping gets registered (a single
virtual doorbell is used on the guest as well).
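
A minimal sketch of the intended host-side call flow (caller and variable
names are hypothetical; in practice this is driven by the VFIO series):

/* the guest registered gIOVA -> gDB for its MSI doorbell */
ret = iommu_bind_guest_msi(domain, giova, gdb_gpa, granule_size);
if (ret)
        return ret;     /* e.g. -ENODEV if the driver lacks support */

/* ... guest lifetime ... */

/* on teardown, withdraw the S1 MSI binding */
iommu_unbind_guest_msi(domain, giova);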

Signed-off-by: Eric Auger 

---
v7 -> v8:
- dummy iommu_unbind_guest_msi turned into a void function

v6 -> v7:
- remove the device handle parameter.
- Add comments saying there can only be a single MSI binding
  registered per iommu_domain
v5 -> v6:
-fix compile issue when IOMMU_API is not set

v3 -> v4:
- add unbind

v2 -> v3:
- add a struct device handle
---
 drivers/iommu/iommu.c | 37 +
 include/linux/iommu.h | 20 
 2 files changed, 57 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index b71ad56f8c99..16068bd4d47b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1756,6 +1756,43 @@ static void __iommu_detach_device(struct iommu_domain 
*domain,
trace_detach_device_from_domain(dev);
 }
 
+/**
+ * iommu_bind_guest_msi - Passes the stage1 GIOVA/GPA mapping of a
+ * virtual doorbell
+ *
+ * @domain: iommu domain the stage 1 mapping will be attached to
+ * @iova: iova allocated by the guest
+ * @gpa: guest physical address of the virtual doorbell
+ * @size: granule size used for the mapping
+ *
+ * The associated IOVA can be reused by the host to create a nested
+ * stage2 binding mapping translating into the physical doorbell used
+ * by the devices attached to the domain.
+ *
+ * All devices within the domain must share the same physical doorbell.
+ * A single MSI GIOVA/GPA mapping can be attached to an iommu_domain.
+ */
+
+int iommu_bind_guest_msi(struct iommu_domain *domain,
+dma_addr_t giova, phys_addr_t gpa, size_t size)
+{
+   if (unlikely(!domain->ops->bind_guest_msi))
+   return -ENODEV;
+
+   return domain->ops->bind_guest_msi(domain, giova, gpa, size);
+}
+EXPORT_SYMBOL_GPL(iommu_bind_guest_msi);
+
+void iommu_unbind_guest_msi(struct iommu_domain *domain,
+   dma_addr_t iova)
+{
+   if (unlikely(!domain->ops->unbind_guest_msi))
+   return;
+
+   domain->ops->unbind_guest_msi(domain, iova);
+}
+EXPORT_SYMBOL_GPL(iommu_unbind_guest_msi);
+
 void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
 {
struct iommu_group *group;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3e1057c3585a..31b3c74f5fe2 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -250,6 +250,8 @@ struct iommu_iotlb_gather {
  * @sva_unbind_gpasid: unbind guest pasid and mm
  * @attach_pasid_table: attach a pasid table
  * @detach_pasid_table: detach the pasid table
+ * @bind_guest_msi: provides a stage1 giova/gpa MSI doorbell mapping
+ * @unbind_guest_msi: withdraw a stage1 giova/gpa MSI doorbell mapping
  * @pgsize_bitmap: bitmap of all possible supported page sizes
  * @owner: Driver module providing these ops
  */
@@ -323,6 +325,10 @@ struct iommu_ops {
 
int (*sva_unbind_gpasid)(struct device *dev, int pasid);
 
+   int (*bind_guest_msi)(struct iommu_domain *domain,
+ dma_addr_t giova, phys_addr_t gpa, size_t size);
+   void (*unbind_guest_msi)(struct iommu_domain *domain, dma_addr_t giova);
+
unsigned long pgsize_bitmap;
struct module *owner;
 };
@@ -454,6 +460,10 @@ extern int iommu_sva_unbind_gpasid(struct iommu_domain 
*domain,
 extern int iommu_attach_pasid_table(struct iommu_domain *domain,
struct iommu_pasid_table_config *cfg);
 extern void iommu_detach_pasid_table(struct iommu_domain *domain);
+extern int iommu_bind_guest_msi(struct iommu_domain *domain,
+   dma_addr_t giova, phys_addr_t gpa, size_t size);
+extern void iommu_unbind_guest_msi(struct iommu_domain *domain,

[PATCH v11 04/13] iommu/smmuv3: Dynamically allocate s1_cfg and s2_cfg

2020-04-14 Thread Eric Auger
In preparation for the introduction of nested stages,
let's turn the s1_cfg and s2_cfg fields into pointers which are
dynamically allocated depending on the smmu_domain stage.

In nested mode, both stages will coexist and s1_cfg will
be allocated when the guest configuration gets passed.
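
A minimal sketch of the resulting allocation policy (hypothetical helper for
illustration; the real allocation happens in domain finalise and, for the
nested case, when the guest PASID table is attached):

static int alloc_stage_cfgs(struct arm_smmu_domain *smmu_domain)
{
        switch (smmu_domain->stage) {
        case ARM_SMMU_DOMAIN_S1:
                smmu_domain->s1_cfg = kzalloc(sizeof(*smmu_domain->s1_cfg),
                                              GFP_KERNEL);
                return smmu_domain->s1_cfg ? 0 : -ENOMEM;
        case ARM_SMMU_DOMAIN_NESTED:
                /* s1_cfg arrives later, with the guest config */
        case ARM_SMMU_DOMAIN_S2:
                smmu_domain->s2_cfg = kzalloc(sizeof(*smmu_domain->s2_cfg),
                                              GFP_KERNEL);
                return smmu_domain->s2_cfg ? 0 : -ENOMEM;
        default:
                return -EINVAL;
        }
}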

Signed-off-by: Eric Auger 
---
 drivers/iommu/arm-smmu-v3.c | 94 -
 1 file changed, 52 insertions(+), 42 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index ac7009348749..da3739bb7323 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -719,10 +719,8 @@ struct arm_smmu_domain {
atomic_tnr_ats_masters;
 
enum arm_smmu_domain_stage  stage;
-   union {
-   struct arm_smmu_s1_cfg  s1_cfg;
-   struct arm_smmu_s2_cfg  s2_cfg;
-   };
+   struct arm_smmu_s1_cfg  *s1_cfg;
+   struct arm_smmu_s2_cfg  *s2_cfg;
 
struct iommu_domain domain;
 
@@ -1598,9 +1596,9 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_domain 
*smmu_domain,
unsigned int idx;
struct arm_smmu_l1_ctx_desc *l1_desc;
struct arm_smmu_device *smmu = smmu_domain->smmu;
-   struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->s1_cfg.cdcfg;
+   struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->s1_cfg->cdcfg;
 
-   if (smmu_domain->s1_cfg.s1fmt == STRTAB_STE_0_S1FMT_LINEAR)
+   if (smmu_domain->s1_cfg->s1fmt == STRTAB_STE_0_S1FMT_LINEAR)
return cdcfg->cdtab + ssid * CTXDESC_CD_DWORDS;
 
idx = ssid >> CTXDESC_SPLIT;
@@ -1635,7 +1633,7 @@ static int arm_smmu_write_ctx_desc(struct arm_smmu_domain 
*smmu_domain,
__le64 *cdptr;
struct arm_smmu_device *smmu = smmu_domain->smmu;
 
-   if (WARN_ON(ssid >= (1 << smmu_domain->s1_cfg.s1cdmax)))
+   if (WARN_ON(ssid >= (1 << smmu_domain->s1_cfg->s1cdmax)))
return -E2BIG;
 
cdptr = arm_smmu_get_cd_ptr(smmu_domain, ssid);
@@ -1700,7 +1698,7 @@ static int arm_smmu_alloc_cd_tables(struct 
arm_smmu_domain *smmu_domain)
size_t l1size;
size_t max_contexts;
struct arm_smmu_device *smmu = smmu_domain->smmu;
-   struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
+   struct arm_smmu_s1_cfg *cfg = smmu_domain->s1_cfg;
struct arm_smmu_ctx_desc_cfg *cdcfg = &cfg->cdcfg;
 
max_contexts = 1 << cfg->s1cdmax;
@@ -1748,7 +1746,7 @@ static void arm_smmu_free_cd_tables(struct 
arm_smmu_domain *smmu_domain)
int i;
size_t size, l1size;
struct arm_smmu_device *smmu = smmu_domain->smmu;
-   struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->s1_cfg.cdcfg;
+   struct arm_smmu_ctx_desc_cfg *cdcfg = &smmu_domain->s1_cfg->cdcfg;
 
if (cdcfg->l1_desc) {
size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3);
@@ -1839,17 +1837,8 @@ static void arm_smmu_write_strtab_ent(struct 
arm_smmu_master *master, u32 sid,
}
 
if (smmu_domain) {
-   switch (smmu_domain->stage) {
-   case ARM_SMMU_DOMAIN_S1:
-   s1_cfg = &smmu_domain->s1_cfg;
-   break;
-   case ARM_SMMU_DOMAIN_S2:
-   case ARM_SMMU_DOMAIN_NESTED:
-   s2_cfg = &smmu_domain->s2_cfg;
-   break;
-   default:
-   break;
-   }
+   s1_cfg = smmu_domain->s1_cfg;
+   s2_cfg = smmu_domain->s2_cfg;
}
 
if (val & STRTAB_STE_0_V) {
@@ -2286,11 +2275,11 @@ static void arm_smmu_tlb_inv_context(void *cookie)
 
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
cmd.opcode  = CMDQ_OP_TLBI_NH_ASID;
-   cmd.tlbi.asid   = smmu_domain->s1_cfg.cd.asid;
+   cmd.tlbi.asid   = smmu_domain->s1_cfg->cd.asid;
cmd.tlbi.vmid   = 0;
} else {
cmd.opcode  = CMDQ_OP_TLBI_S12_VMALL;
-   cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
+   cmd.tlbi.vmid   = smmu_domain->s2_cfg->vmid;
}
 
/*
@@ -2324,10 +2313,10 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, 
size_t size,
 
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
cmd.opcode  = CMDQ_OP_TLBI_NH_VA;
-   cmd.tlbi.asid   = smmu_domain->s1_cfg.cd.asid;
+   cmd.tlbi.asid   = smmu_domain->s1_cfg->cd.asid;
} else {
cmd.opcode  = CMDQ_OP_TLBI_S2_IPA;
-   cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
+   cmd.tlbi.vmid   = smmu_domain->s2_cfg->vmid;
}
 
if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
@@ -2477,22 +2466,24 @@ static void arm_smmu_domain_free(struct iommu_domain 
*domain)
 {
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct arm_smmu_device *smmu = smmu_domain->smmu;
+   struct 

[PATCH v11 00/13] SMMUv3 Nested Stage Setup (IOMMU part)

2020-04-14 Thread Eric Auger
This version fixes an issue observed by Shameer on an SMMU 3.2
when moving from a dual-stage config to a stage 1 only config.
The 2 high 64b words of the STE now get reset. Otherwise, leaving
the S2TTB set may cause a C_BAD_STE error.

This series can be found at:
https://github.com/eauger/linux/tree/v5.6-2stage-v11_10.1
(including the VFIO part)
The QEMU fellow series still can be found at:
https://github.com/eauger/qemu/tree/v4.2.0-2stage-rfcv6

Users have expressed interest in that work and tested v9/v10:
- https://patchwork.kernel.org/cover/11039995/#23012381
- https://patchwork.kernel.org/cover/11039995/#23197235

Background:

This series brings the IOMMU part of HW nested paging support
in the SMMUv3. The VFIO part is submitted separately.

The IOMMU API is extended to support 2 new API functionalities:
1) pass the guest stage 1 configuration
2) pass stage 1 MSI bindings

Then those capabilities get implemented in the SMMUv3 driver.

The virtualizer passes information through the VFIO user API,
which cascades it to the iommu subsystem. This allows the guest
to own the stage 1 tables and context descriptors (the so-called
PASID table) while the host owns the stage 2 tables and the main
configuration structures (STE).
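
For orientation, the host-side call sequence enabled by this series looks
roughly as follows (sketch only; the exact plumbing lives in the VFIO series
and in the cache-invalidate uapi this work builds on):

/* guest passes its CD table (PASID table): program S1 into the STE */
iommu_attach_pasid_table(domain, &pasid_cfg);

/* guest registers its MSI doorbell gIOVA/gDB (nested MSI cookie) */
iommu_bind_guest_msi(domain, giova, gdb_gpa, granule);

/* guest TLB invalidations are forwarded down to the SMMU */
iommu_cache_invalidate(domain, dev, &inv_info);

/* teardown */
iommu_unbind_guest_msi(domain, giova);
iommu_detach_pasid_table(domain);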

Best Regards

Eric


History:

v10 -> v11:
- S2TTB reset when S2 is off
- fix compile issue when CONFIG_IOMMU_DMA is not set

v9 -> v10:
- rebase on top of 5.6.0-rc3

v8 -> v9:
- rebase on 5.3
- split iommu/vfio parts

v6 -> v8:
- Implement VFIO-PCI device specific interrupt framework

v7 -> v8:
- rebase on top of v5.2-rc1 and especially
  8be39a1a04c1  iommu/arm-smmu-v3: Add a master->domain pointer
- dynamic alloc of s1_cfg/s2_cfg
- __arm_smmu_tlb_inv_asid/s1_range_nosync
- check there is no HW MSI regions
- asid invalidation using pasid extended struct (change in the uapi)
- add s1_live/s2_live checks
- move check about support of nested stages in domain finalise
- fixes in error reporting according to the discussion with Robin
- reordered the patches to have first iommu/smmuv3 patches and then
  VFIO patches

v6 -> v7:
- removed device handle from bind/unbind_guest_msi
- added "iommu/smmuv3: Nested mode single MSI doorbell per domain
  enforcement"
- added a few uapi comments as suggested by Jean, Jacob and Alex

v5 -> v6:
- Fix compilation issue when CONFIG_IOMMU_API is unset

v4 -> v5:
- fix bug reported by Vincent: fault handler unregistration now happens in
  vfio_pci_release
- IOMMU_FAULT_PERM_* moved outside of struct definition + small
  uapi changes suggested by Jean-Philippe (except fetch_addr)
- iommu: introduce device fault report API: removed the PRI part.
- see individual logs for more details
- reset the ste abort flag on detach

v3 -> v4:
- took into account Alex, Jean-Philippe and Robin's comments on v3
- rework of the smmuv3 driver integration
- add tear down ops for msi binding and PASID table binding
- fix S1 fault propagation
- put fault reporting patches at the beginning of the series following
  Jean-Philippe's request
- update of the cache invalidate and fault API uapis
- VFIO fault reporting rework with 2 separate regions and one mmappable
  segment for the fault queue
- moved to PATCH

v2 -> v3:
- When registering the S1 MSI binding we now store the device handle. This
  addresses Robin's comment about discrimination of devices belonging to
  different S1 groups and using different physical MSI doorbells.
- Change the fault reporting API: use VFIO_PCI_DMA_FAULT_IRQ_INDEX to
  set the eventfd and expose the faults through an mmappable fault region

v1 -> v2:
- Added the fault reporting capability
- asid properly passed on invalidation (fix assignment of multiple
  devices)
- see individual change logs for more info

Eric Auger (11):
  iommu: Introduce bind/unbind_guest_msi
  iommu/smmuv3: Dynamically allocate s1_cfg and s2_cfg
  iommu/smmuv3: Get prepared for nested stage support
  iommu/smmuv3: Implement attach/detach_pasid_table
  iommu/smmuv3: Allow stage 1 invalidation with unmanaged ASIDs
  iommu/smmuv3: Implement cache_invalidate
  dma-iommu: Implement NESTED_MSI cookie
  iommu/smmuv3: Nested mode single MSI doorbell per domain enforcement
  iommu/smmuv3: Enforce incompatibility between nested mode and HW MSI
regions
  iommu/smmuv3: Implement bind/unbind_guest_msi
  iommu/smmuv3: Report non recoverable faults

Jacob Pan (1):
  iommu: Introduce attach/detach_pasid_table API

Jean-Philippe Brucker (1):
  iommu/arm-smmu-v3: Maintain a SID->device structure

 drivers/iommu/arm-smmu-v3.c | 744 
 drivers/iommu/dma-iommu.c   | 142 ++-
 drivers/iommu/iommu.c   |  56 +++
 include/linux/dma-iommu.h   |  16 +
 include/linux/iommu.h   |  38 ++
 include/uapi/linux/iommu.h  |  51 +++
 6 files changed, 975 insertions(+), 72 deletions(-)

-- 
2.20.1



[PATCH v11 01/13] iommu: Introduce attach/detach_pasid_table API

2020-04-14 Thread Eric Auger
From: Jacob Pan 

In the virtualization use case, when a guest is assigned
a PCI host device protected by a virtual IOMMU on the guest,
the physical IOMMU must be programmed to be consistent with
the guest mappings. If the physical IOMMU supports two
translation stages, it makes sense to program guest mappings
onto the first stage/level (ARM/Intel terminology) while the host
owns stage/level 2.

In that case, guest configuration settings must be trapped
and passed to the physical iommu driver.

This patch adds a new API to the iommu subsystem that allows
setting/unsetting the pasid table information.

A generic iommu_pasid_table_config struct is introduced in
a new iommu.h uapi header. This is going to be used by the VFIO
user API.
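
As a usage sketch, a caller (typically the VFIO layer) fills the new uapi
struct and hands it to the domain (the values below are hypothetical;
pasid_bits is left at 0, i.e. a single context descriptor):

struct iommu_pasid_table_config pasid_cfg = {
        .version  = PASID_TABLE_CFG_VERSION_1,
        .format   = IOMMU_PASID_FORMAT_SMMUV3,
        .config   = IOMMU_PASID_CONFIG_TRANSLATE,
        .base_ptr = guest_cd_table_gpa,         /* hypothetical GPA */
        .smmuv3   = {
                .version = PASID_TABLE_SMMUV3_CFG_VERSION_1,
        },
};
int ret;

ret = iommu_attach_pasid_table(domain, &pasid_cfg);
if (ret)
        return ret;     /* -ENODEV if the driver lacks support */
/* ... */
iommu_detach_pasid_table(domain);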

Signed-off-by: Jean-Philippe Brucker 
Signed-off-by: Liu, Yi L 
Signed-off-by: Ashok Raj 
Signed-off-by: Jacob Pan 
Signed-off-by: Eric Auger 
Reviewed-by: Jean-Philippe Brucker 
---
 drivers/iommu/iommu.c  | 19 ++
 include/linux/iommu.h  | 18 ++
 include/uapi/linux/iommu.h | 51 ++
 3 files changed, 88 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 2b471419e26c..b71ad56f8c99 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1723,6 +1723,25 @@ int iommu_sva_unbind_gpasid(struct iommu_domain *domain, 
struct device *dev,
 }
 EXPORT_SYMBOL_GPL(iommu_sva_unbind_gpasid);
 
+int iommu_attach_pasid_table(struct iommu_domain *domain,
+struct iommu_pasid_table_config *cfg)
+{
+   if (unlikely(!domain->ops->attach_pasid_table))
+   return -ENODEV;
+
+   return domain->ops->attach_pasid_table(domain, cfg);
+}
+EXPORT_SYMBOL_GPL(iommu_attach_pasid_table);
+
+void iommu_detach_pasid_table(struct iommu_domain *domain)
+{
+   if (unlikely(!domain->ops->detach_pasid_table))
+   return;
+
+   domain->ops->detach_pasid_table(domain);
+}
+EXPORT_SYMBOL_GPL(iommu_detach_pasid_table);
+
 static void __iommu_detach_device(struct iommu_domain *domain,
  struct device *dev)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 7ef8b0bda695..3e1057c3585a 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -248,6 +248,8 @@ struct iommu_iotlb_gather {
  * @cache_invalidate: invalidate translation caches
  * @sva_bind_gpasid: bind guest pasid and mm
  * @sva_unbind_gpasid: unbind guest pasid and mm
+ * @attach_pasid_table: attach a pasid table
+ * @detach_pasid_table: detach the pasid table
  * @pgsize_bitmap: bitmap of all possible supported page sizes
  * @owner: Driver module providing these ops
  */
@@ -307,6 +309,9 @@ struct iommu_ops {
  void *drvdata);
void (*sva_unbind)(struct iommu_sva *handle);
int (*sva_get_pasid)(struct iommu_sva *handle);
+   int (*attach_pasid_table)(struct iommu_domain *domain,
+ struct iommu_pasid_table_config *cfg);
+   void (*detach_pasid_table)(struct iommu_domain *domain);
 
int (*page_response)(struct device *dev,
 struct iommu_fault_event *evt,
@@ -446,6 +451,9 @@ extern int iommu_sva_bind_gpasid(struct iommu_domain 
*domain,
struct device *dev, struct iommu_gpasid_bind_data *data);
 extern int iommu_sva_unbind_gpasid(struct iommu_domain *domain,
struct device *dev, ioasid_t pasid);
+extern int iommu_attach_pasid_table(struct iommu_domain *domain,
+   struct iommu_pasid_table_config *cfg);
+extern void iommu_detach_pasid_table(struct iommu_domain *domain);
 extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
 extern struct iommu_domain *iommu_get_dma_domain(struct device *dev);
 extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
@@ -1048,6 +1056,16 @@ iommu_aux_get_pasid(struct iommu_domain *domain, struct 
device *dev)
return -ENODEV;
 }
 
+static inline
+int iommu_attach_pasid_table(struct iommu_domain *domain,
+struct iommu_pasid_table_config *cfg)
+{
+   return -ENODEV;
+}
+
+static inline
+void iommu_detach_pasid_table(struct iommu_domain *domain) {}
+
 static inline struct iommu_sva *
 iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata)
 {
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
index 4ad3496e5c43..8d00be10dc6d 100644
--- a/include/uapi/linux/iommu.h
+++ b/include/uapi/linux/iommu.h
@@ -321,4 +321,55 @@ struct iommu_gpasid_bind_data {
};
 };
 
+/**
+ * struct iommu_pasid_smmuv3 - ARM SMMUv3 Stream Table Entry stage 1 related
+ * information
+ * @version: API version of this structure
+ * @s1fmt: STE s1fmt (format of the CD table: single CD, linear table
+ * or 2-level table)
+ * @s1dss: STE s1dss (specifies the behavior when @pasid_bits != 0
+ *

Re: [RFC PATCH 1/4] bus: fsl-mc: add custom .dma_configure implementation

2020-04-14 Thread Lorenzo Pieralisi
On Wed, Mar 25, 2020 at 06:48:55PM +0200, Laurentiu Tudor wrote:
> Hi Lorenzo,
> 
> On 3/25/2020 2:51 PM, Lorenzo Pieralisi wrote:
> > On Thu, Feb 27, 2020 at 12:05:39PM +0200, laurentiu.tu...@nxp.com wrote:
> >> From: Laurentiu Tudor 
> >>
> >> The devices on this bus are not discovered by way of device tree
> >> but by queries to the firmware. It makes little sense to trick the
> >> generic OF layer into thinking that these devices are OF-described
> >> just so that we can get our dma configuration. Instead of doing
> >> that, add our custom dma configuration implementation.
> >>
> >> Signed-off-by: Laurentiu Tudor 
> >> ---
> >>  drivers/bus/fsl-mc/fsl-mc-bus.c | 31 ++-
> >>  1 file changed, 30 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c 
> >> b/drivers/bus/fsl-mc/fsl-mc-bus.c
> >> index 36eb25f82c8e..eafaa0e0b906 100644
> >> --- a/drivers/bus/fsl-mc/fsl-mc-bus.c
> >> +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c
> >> @@ -132,11 +132,40 @@ static int fsl_mc_bus_uevent(struct device *dev, 
> >> struct kobj_uevent_env *env)
> >>  static int fsl_mc_dma_configure(struct device *dev)
> >>  {
> >>struct device *dma_dev = dev;
> >> +  struct iommu_fwspec *fwspec;
> >> +  const struct iommu_ops *iommu_ops;
> >> +  struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev);
> >> +  int ret;
> >> +  u32 icid;
> >>  
> >>while (dev_is_fsl_mc(dma_dev))
> >>dma_dev = dma_dev->parent;
> >>  
> >> -  return of_dma_configure(dev, dma_dev->of_node, 0);
> >> +  fwspec = dev_iommu_fwspec_get(dma_dev);
> >> +  if (!fwspec)
> >> +  return -ENODEV;
> >> +  iommu_ops = iommu_ops_from_fwnode(fwspec->iommu_fwnode);
> >> +  if (!iommu_ops)
> >> +  return -ENODEV;
> >> +
> >> +  ret = iommu_fwspec_init(dev, fwspec->iommu_fwnode, iommu_ops);
> >> +  if (ret)
> >> +  return ret;
> >> +
> >> +  icid = mc_dev->icid;
> >> +  ret = iommu_fwspec_add_ids(dev, &icid, 1);
> > 
> > I see. So with this patch we would use the MC named component only to
> > retrieve the iommu_ops
> 
> Right. I'd also add that the implementation tries to follow the existing
> standard .dma_configure implementations, e.g. of_dma_configure +
> of_iommu_configure. I'd also note that similarly to the ACPI case, this
> MC FW device is probed as a platform device in the DT scenario, binding
> here [1].
> A similar approach is used for the retrieval of the msi irq domain, see
> following patch.
> 
> > - the stream IDs are injected directly here, bypassing the OF/IORT binding
> > translations altogether.
> 
> Actually I've submitted a v2 [2] that calls into .of_xlate() to allow
> the smmu driver to do some processing on the raw streamid coming from
> the firmware. I have not yet tested this with ACPI but expect it to
> work; however, it's debatable how valid this approach is in the context
> of ACPI.

Actually, what I think you need is of_map_rid() (and an IORT
equivalent, that I am going to write - generalizing iort_msi_map_rid()).

Would that be enough to enable IORT "normal" mappings in the MC bus
named components ?
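
(For illustration, a rough and untested sketch of that approach on the DT
side, translating the ICID through a standard iommu-map with of_map_rid();
the mc_fw_np variable naming is made up:)

struct device_node *iommu_np;
u32 streamid;
int err;

/* translate the MC ICID through the iommu-map of the MC firmware node */
err = of_map_rid(mc_fw_np, mc_dev->icid, "iommu-map", "iommu-map-mask",
                 &iommu_np, &streamid);
if (!err)
        err = iommu_fwspec_add_ids(dev, &streamid, 1);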

Thanks,
Lorenzo


[PATCH] iommu: spapr_tce: Disable compile testing to fix build on book3s_32 config

2020-04-14 Thread Krzysztof Kozlowski
Although SPAPR_TCE_IOMMU itself can be compile tested on certain PowerPC
configurations, its presence makes arch/powerpc/kvm/Makefile select
modules which do not build in such configurations.

The arch/powerpc/kvm/ modules use kvm_arch.spapr_tce_tables, which exists
only with CONFIG_PPC_BOOK3S_64.  However these modules are selected when
COMPILE_TEST and SPAPR_TCE_IOMMU are chosen, leading to build failures:

In file included from arch/powerpc/include/asm/book3s/64/mmu-hash.h:20:0,
 from arch/powerpc/kvm/book3s_64_vio_hv.c:22:
arch/powerpc/include/asm/book3s/64/pgtable.h:17:0: error: "_PAGE_EXEC" 
redefined [-Werror]
 #define _PAGE_EXEC  0x1 /* execute permission */

In file included from arch/powerpc/include/asm/book3s/32/pgtable.h:8:0,
 from arch/powerpc/include/asm/book3s/pgtable.h:8,
 from arch/powerpc/include/asm/pgtable.h:18,
 from include/linux/mm.h:95,
 from arch/powerpc/include/asm/io.h:29,
 from include/linux/io.h:13,
 from include/linux/irq.h:20,
 from arch/powerpc/include/asm/hardirq.h:6,
 from include/linux/hardirq.h:9,
 from include/linux/kvm_host.h:7,
 from arch/powerpc/kvm/book3s_64_vio_hv.c:12:
arch/powerpc/include/asm/book3s/32/hash.h:29:0: note: this is the location 
of the previous definition
 #define _PAGE_EXEC 0x200 /* software: exec allowed */

Reported-by: Geert Uytterhoeven 
Fixes: e93a1695d7fb ("iommu: Enable compile testing for some of drivers")
Signed-off-by: Krzysztof Kozlowski 
---
 drivers/iommu/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 58b4a4dbfc78..3532b1ead19d 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -362,7 +362,7 @@ config IPMMU_VMSA
 
 config SPAPR_TCE_IOMMU
bool "sPAPR TCE IOMMU Support"
-   depends on PPC_POWERNV || PPC_PSERIES || (PPC && COMPILE_TEST)
+   depends on PPC_POWERNV || PPC_PSERIES
select IOMMU_API
help
  Enables bits of IOMMU API required by VFIO. The iommu_ops
-- 
2.17.1



Re: [PATCH] iommu/exynos: Get rid of 'struct exynos_iommu_owner' exynos_iommu_owner

2020-04-14 Thread Joerg Roedel
On Thu, Apr 09, 2020 at 03:58:00PM +0200, Marek Szyprowski wrote:
> I've checked and it works fine on top of
> ff68eb23308e6538ec7864c83d39540f423bbe90. However I'm not a fan of
> removing this 'owner' structure. It gave a nice abstraction for all the
> SYSMMU controllers of a given device (although most devices in the
> system have only one SYSMMU). Why is this structure a problem for your
> rework?

Okay, the structure itself is not a problem, I just thought it was not
really necessary. But to keep things simple I've taken another approach
for v2 of this series: just use the first SYSMMU in the controllers list
to link the device and the IOMMU. When the owner structure exists there
is always one entry in this list, so that should work fine.

Regards,

Joerg



Re: [RFC PATCH 33/34] iommu: Remove add_device()/remove_device() code-paths

2020-04-14 Thread Joerg Roedel
Hi Marek,

On Fri, Apr 10, 2020 at 12:39:38PM +0200, Marek Szyprowski wrote:
> > +   if (!group->default_domain)
> > +   continue;
> 
> It isn't obvious from the above diff, but this continue leaks the
> group->lock that was taken.

You are right, thanks for the review! I fixed it in v2.
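
(For context, the problematic pattern and the fix look roughly like this;
a sketch only, not the exact v2 hunk:)

list_for_each_entry(group, &group_list, entry) {
        mutex_lock(&group->mutex);
        if (!group->default_domain) {
                mutex_unlock(&group->mutex);    /* this unlock was missing */
                continue;
        }
        /* ... default domain / direct mapping setup ... */
        mutex_unlock(&group->mutex);
}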

Regards,

Joerg



[PATCH 26/29] mm: remove vmalloc_user_node_flags

2020-04-14 Thread Christoph Hellwig
Open code it in __bpf_map_area_alloc, which is the only caller.  Also
clean up __bpf_map_area_alloc to have a single vmalloc call with
slightly different flags instead of the current two different calls.

For this to compile in the nommu case, add a __vmalloc_node_range stub
to nommu.c.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
Acked-by: Johannes Weiner 
---
 include/linux/vmalloc.h |  1 -
 kernel/bpf/syscall.c| 24 ++--
 mm/nommu.c  | 14 --
 mm/vmalloc.c| 20 
 4 files changed, 22 insertions(+), 37 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 108f49b47756..f90f2946aac2 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -106,7 +106,6 @@ extern void *vzalloc(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
 extern void *vzalloc_node(unsigned long size, int node);
-extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t 
flags);
 extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
 extern void *vmalloc_32_user(unsigned long size);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 48d98ea8fad6..dd30b334c554 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -281,26 +282,29 @@ static void *__bpf_map_area_alloc(u64 size, int 
numa_node, bool mmapable)
 * __GFP_RETRY_MAYFAIL to avoid such situations.
 */
 
-   const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
+   const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO;
+   unsigned int flags = 0;
+   unsigned long align = 1;
void *area;
 
if (size >= SIZE_MAX)
return NULL;
 
/* kmalloc()'ed memory can't be mmap()'ed */
-   if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-   area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
+   if (mmapable) {
+   BUG_ON(!PAGE_ALIGNED(size));
+   align = SHMLBA;
+   flags = VM_USERMAP;
+   } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+   area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
numa_node);
if (area != NULL)
return area;
}
-   if (mmapable) {
-   BUG_ON(!PAGE_ALIGNED(size));
-   return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL |
-  __GFP_RETRY_MAYFAIL | flags);
-   }
-   return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags,
- numa_node, __builtin_return_address(0));
+
+   return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+   gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
+   flags, numa_node, __builtin_return_address(0));
 }
 
 void *bpf_map_area_alloc(u64 size, int numa_node)
diff --git a/mm/nommu.c b/mm/nommu.c
index 81a86cd85893..b42cd6003d7d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -150,6 +150,14 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__vmalloc);
 
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+   unsigned long start, unsigned long end, gfp_t gfp_mask,
+   pgprot_t prot, unsigned long vm_flags, int node,
+   const void *caller)
+{
+   return __vmalloc(size, gfp_mask);
+}
+
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
int node, const void *caller)
 {
@@ -180,12 +188,6 @@ void *vmalloc_user(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc_user);
 
-void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
-{
-   return __vmalloc_user_flags(size, flags | __GFP_ZERO);
-}
-EXPORT_SYMBOL(vmalloc_user_node_flags);
-
 struct page *vmalloc_to_page(const void *addr)
 {
return virt_to_page(addr);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 333fbe77255a..f6f2acdaf70c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2658,26 +2658,6 @@ void *vzalloc_node(unsigned long size, int node)
 }
 EXPORT_SYMBOL(vzalloc_node);
 
-/**
- * vmalloc_user_node_flags - allocate memory for userspace on a specific node
- * @size: allocation size
- * @node: numa node
- * @flags: flags for the page level allocator
- *
- * The resulting memory area is zeroed so it can be mapped to userspace
- * without leaking data.
- *
- * Return: pointer to the allocated memory or %NULL on error
- */
-void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
-{
-   return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
- 

[PATCH v2 00/33] iommu: Move iommu_group setup to IOMMU core code

2020-04-14 Thread Joerg Roedel
Hi,

here is the second version of this patch-set. The first version with
some more introductory text can be found here:

https://lore.kernel.org/lkml/20200407183742.4344-1-j...@8bytes.org/

Changes v1->v2:

* Rebased to v5.7-rc1

* Re-wrote the arm-smmu changes as suggested by Robin Murphy

* Re-worked the Exynos patches to hopefully not break the
  driver anymore

* Fixed a missing mutex_unlock() reported by Marek Szyprowski,
  thanks for that.

There is also a git-branch available with these patches applied:


https://git.kernel.org/pub/scm/linux/kernel/git/joro/linux.git/log/?h=iommu-probe-device-v2

Please review.

Thanks,

Joerg

Joerg Roedel (32):
  iommu: Move default domain allocation to separate function
  iommu/amd: Implement iommu_ops->def_domain_type call-back
  iommu/vt-d: Wire up iommu_ops->def_domain_type
  iommu/amd: Remove dma_mask check from check_device()
  iommu/amd: Return -ENODEV in add_device when device is not handled by
IOMMU
  iommu: Add probe_device() and remove_device() call-backs
  iommu: Move default domain allocation to iommu_probe_device()
  iommu: Keep a list of allocated groups in __iommu_probe_device()
  iommu: Move new probe_device path to separate function
  iommu: Split off default domain allocation from group assignment
  iommu: Move iommu_group_create_direct_mappings() out of
iommu_group_add_device()
  iommu: Export bus_iommu_probe() and make is safe for re-probing
  iommu/amd: Remove dev_data->passthrough
  iommu/amd: Convert to probe/release_device() call-backs
  iommu/vt-d: Convert to probe/release_device() call-backs
  iommu/arm-smmu: Convert to probe/release_device() call-backs
  iommu/pamu: Convert to probe/release_device() call-backs
  iommu/s390: Convert to probe/release_device() call-backs
  iommu/virtio: Convert to probe/release_device() call-backs
  iommu/msm: Convert to probe/release_device() call-backs
  iommu/mediatek: Convert to probe/release_device() call-backs
  iommu/mediatek-v1 Convert to probe/release_device() call-backs
  iommu/qcom: Convert to probe/release_device() call-backs
  iommu/rockchip: Convert to probe/release_device() call-backs
  iommu/tegra: Convert to probe/release_device() call-backs
  iommu/renesas: Convert to probe/release_device() call-backs
  iommu/omap: Remove orphan_dev tracking
  iommu/omap: Convert to probe/release_device() call-backs
  iommu/exynos: Use first SYSMMU in controllers list for IOMMU core
  iommu/exynos: Convert to probe/release_device() call-backs
  iommu: Remove add_device()/remove_device() code-paths
  iommu: Unexport iommu_group_get_for_dev()

Sai Praneeth Prakhya (1):
  iommu: Add def_domain_type() callback in iommu_ops

 drivers/iommu/amd_iommu.c   |  97 
 drivers/iommu/amd_iommu_types.h |   1 -
 drivers/iommu/arm-smmu-v3.c |  38 +--
 drivers/iommu/arm-smmu.c|  39 ++--
 drivers/iommu/exynos-iommu.c|  24 +-
 drivers/iommu/fsl_pamu_domain.c |  22 +-
 drivers/iommu/intel-iommu.c |  68 +-
 drivers/iommu/iommu.c   | 393 +---
 drivers/iommu/ipmmu-vmsa.c  |  60 ++---
 drivers/iommu/msm_iommu.c   |  34 +--
 drivers/iommu/mtk_iommu.c   |  24 +-
 drivers/iommu/mtk_iommu_v1.c|  50 ++--
 drivers/iommu/omap-iommu.c  |  99 ++--
 drivers/iommu/qcom_iommu.c  |  24 +-
 drivers/iommu/rockchip-iommu.c  |  26 +--
 drivers/iommu/s390-iommu.c  |  22 +-
 drivers/iommu/tegra-gart.c  |  24 +-
 drivers/iommu/tegra-smmu.c  |  31 +--
 drivers/iommu/virtio-iommu.c|  41 +---
 include/linux/iommu.h   |  21 +-
 20 files changed, 533 insertions(+), 605 deletions(-)

-- 
2.17.1



[PATCH v2 33/33] iommu: Unexport iommu_group_get_for_dev()

2020-04-14 Thread Joerg Roedel
From: Joerg Roedel 

The function is now only used in IOMMU core code and shouldn't be used
outside of it anyway, so remove the export for it.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/iommu.c | 4 ++--
 include/linux/iommu.h | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 9a7120746b8e..e9413732c61e 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -91,6 +91,7 @@ static void __iommu_detach_group(struct iommu_domain *domain,
 struct iommu_group *group);
 static int iommu_create_device_direct_mappings(struct iommu_group *group,
   struct device *dev);
+static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
 
 #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store)  \
 struct iommu_group_attribute iommu_group_attr_##_name =\
@@ -1483,7 +1484,7 @@ static int iommu_alloc_default_domain(struct device *dev)
  * to the returned IOMMU group, which will already include the provided
  * device.  The reference should be released with iommu_group_put().
  */
-struct iommu_group *iommu_group_get_for_dev(struct device *dev)
+static struct iommu_group *iommu_group_get_for_dev(struct device *dev)
 {
const struct iommu_ops *ops = dev->bus->iommu_ops;
struct iommu_group *group;
@@ -1514,7 +1515,6 @@ struct iommu_group *iommu_group_get_for_dev(struct device 
*dev)
 
return ERR_PTR(ret);
 }
-EXPORT_SYMBOL(iommu_group_get_for_dev);
 
 struct iommu_domain *iommu_group_default_domain(struct iommu_group *group)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index dd076366383f..7cfd2dddb49d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -527,7 +527,6 @@ extern int iommu_page_response(struct device *dev,
   struct iommu_page_response *msg);
 
 extern int iommu_group_id(struct iommu_group *group);
-extern struct iommu_group *iommu_group_get_for_dev(struct device *dev);
 extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
 
 extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr,
-- 
2.17.1



[PATCH 25/29] mm: switch the test_vmalloc module to use __vmalloc_node

2020-04-14 Thread Christoph Hellwig
No need to export the very low-level __vmalloc_node_range when the
test module can use a slightly higher level variant.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 lib/test_vmalloc.c | 26 +++---
 mm/vmalloc.c   | 17 -
 2 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index 8bbefcaddfe8..cd6aef05dfb4 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -91,12 +91,8 @@ static int random_size_align_alloc_test(void)
 */
size = ((rnd % 10) + 1) * PAGE_SIZE;
 
-   ptr = __vmalloc_node_range(size, align,
-  VMALLOC_START, VMALLOC_END,
-  GFP_KERNEL | __GFP_ZERO,
-  PAGE_KERNEL,
-  0, 0, __builtin_return_address(0));
-
+   ptr = __vmalloc_node(size, align, GFP_KERNEL | __GFP_ZERO,
+   __builtin_return_address(0));
if (!ptr)
return -1;
 
@@ -118,12 +114,8 @@ static int align_shift_alloc_test(void)
for (i = 0; i < BITS_PER_LONG; i++) {
align = ((unsigned long) 1) << i;
 
-   ptr = __vmalloc_node_range(PAGE_SIZE, align,
-   VMALLOC_START, VMALLOC_END,
-   GFP_KERNEL | __GFP_ZERO,
-   PAGE_KERNEL,
-   0, 0, __builtin_return_address(0));
-
+   ptr = __vmalloc_node(PAGE_SIZE, align, GFP_KERNEL | __GFP_ZERO,
+   __builtin_return_address(0));
if (!ptr)
return -1;
 
@@ -139,13 +131,9 @@ static int fix_align_alloc_test(void)
int i;
 
for (i = 0; i < test_loop_count; i++) {
-   ptr = __vmalloc_node_range(5 * PAGE_SIZE,
-   THREAD_ALIGN << 1,
-   VMALLOC_START, VMALLOC_END,
-   GFP_KERNEL | __GFP_ZERO,
-   PAGE_KERNEL,
-   0, 0, __builtin_return_address(0));
-
+   ptr = __vmalloc_node(5 * PAGE_SIZE, THREAD_ALIGN << 1,
+   GFP_KERNEL | __GFP_ZERO,
+   __builtin_return_address(0));
if (!ptr)
return -1;
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ae8249ef5821..333fbe77255a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2522,15 +2522,6 @@ void *__vmalloc_node_range(unsigned long size, unsigned 
long align,
return NULL;
 }
 
-/*
- * This is only for performance analysis of vmalloc and stress purpose.
- * It is required by vmalloc test module, therefore do not use it other
- * than that.
- */
-#ifdef CONFIG_TEST_VMALLOC_MODULE
-EXPORT_SYMBOL_GPL(__vmalloc_node_range);
-#endif
-
 /**
  * __vmalloc_node - allocate virtually contiguous memory
  * @size:  allocation size
@@ -2556,6 +2547,14 @@ void *__vmalloc_node(unsigned long size, unsigned long 
align,
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
gfp_mask, PAGE_KERNEL, 0, node, caller);
 }
+/*
+ * This is only for performance analysis of vmalloc and stress purpose.
+ * It is required by vmalloc test module, therefore do not use it other
+ * than that.
+ */
+#ifdef CONFIG_TEST_VMALLOC_MODULE
+EXPORT_SYMBOL_GPL(__vmalloc_node);
+#endif
 
 void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 {
-- 
2.25.1



[PATCH 18/29] mm: remove the prot argument from vm_map_ram

2020-04-14 Thread Christoph Hellwig
This is always PAGE_KERNEL - for long-term mappings with other
properties, vmap should be used.
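
A short before/after sketch of what this means for callers (illustrative;
the pgprot in the last line is just an example of "other properties"):

/* before: the prot argument existed but had to be PAGE_KERNEL anyway */
addr = vm_map_ram(pages, count, -1, PAGE_KERNEL);

/* after: no prot argument */
addr = vm_map_ram(pages, count, -1);

/* need a different protection for a long-term mapping? use vmap() */
addr = vmap(pages, count, VM_MAP, pgprot_writecombine(PAGE_KERNEL));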

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c   | 2 +-
 drivers/media/common/videobuf2/videobuf2-dma-sg.c  | 3 +--
 drivers/media/common/videobuf2/videobuf2-vmalloc.c | 3 +--
 fs/erofs/decompressor.c| 2 +-
 fs/xfs/xfs_buf.c   | 2 +-
 include/linux/vmalloc.h| 3 +--
 mm/nommu.c | 2 +-
 mm/vmalloc.c   | 4 ++--
 8 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c 
b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
index 9272bef57092..debaf7b18ab5 100644
--- a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
@@ -66,7 +66,7 @@ static void *mock_dmabuf_vmap(struct dma_buf *dma_buf)
 {
struct mock_dmabuf *mock = to_mock(dma_buf);
 
-   return vm_map_ram(mock->pages, mock->npages, 0, PAGE_KERNEL);
+   return vm_map_ram(mock->pages, mock->npages, 0);
 }
 
 static void mock_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr)
diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c 
b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
index 6db60e9d5183..92072a08af25 100644
--- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c
+++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
@@ -309,8 +309,7 @@ static void *vb2_dma_sg_vaddr(void *buf_priv)
if (buf->db_attach)
buf->vaddr = dma_buf_vmap(buf->db_attach->dmabuf);
else
-   buf->vaddr = vm_map_ram(buf->pages,
-   buf->num_pages, -1, PAGE_KERNEL);
+   buf->vaddr = vm_map_ram(buf->pages, buf->num_pages, -1);
}
 
/* add offset in case userptr is not page-aligned */
diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c 
b/drivers/media/common/videobuf2/videobuf2-vmalloc.c
index 1a4f0ca87c7c..c66fda4a65e4 100644
--- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c
+++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c
@@ -107,8 +107,7 @@ static void *vb2_vmalloc_get_userptr(struct device *dev, 
unsigned long vaddr,
buf->vaddr = (__force void *)
ioremap(__pfn_to_phys(nums[0]), size + offset);
} else {
-   buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1,
-   PAGE_KERNEL);
+   buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1);
}
 
if (!buf->vaddr)
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 5d2d81940679..7628816f2453 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -274,7 +274,7 @@ static int z_erofs_decompress_generic(struct 
z_erofs_decompress_req *rq,
 
i = 0;
while (1) {
-   dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL);
+   dst = vm_map_ram(rq->out, nrpages_out, -1);
 
/* retry two more times (totally 3 times) */
if (dst || ++i >= 3)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9ec3eaf1c618..65538d18e64f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -477,7 +477,7 @@ _xfs_buf_map_pages(
nofs_flag = memalloc_nofs_save();
do {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-   -1, PAGE_KERNEL);
+   -1);
if (bp->b_addr)
break;
vm_unmap_aliases();
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 15ffbd8e8e65..9273b1a91ca5 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -88,8 +88,7 @@ struct vmap_area {
  * Highlevel APIs for driver use
  */
 extern void vm_unmap_ram(const void *mem, unsigned int count);
-extern void *vm_map_ram(struct page **pages, unsigned int count,
-   int node, pgprot_t prot);
+extern void *vm_map_ram(struct page **pages, unsigned int count, int node);
 extern void vm_unmap_aliases(void);
 
 #ifdef CONFIG_MMU
diff --git a/mm/nommu.c b/mm/nommu.c
index 318df4e236c9..4f07b7ef0297 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -351,7 +351,7 @@ void vunmap(const void *addr)
 }
 EXPORT_SYMBOL(vunmap);
 
-void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t 
prot)
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
 {
BUG();
return NULL;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 258220b203f1..7356b3f07bd8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1834,7 +1834,7 @@ 

[PATCH v2 25/33] iommu/rockchip: Convert to probe/release_device() call-backs

2020-04-14 Thread Joerg Roedel
From: Joerg Roedel 

Convert the Rockchip IOMMU driver to use the probe_device() and
release_device() call-backs of iommu_ops, so that the iommu core code
does the group and sysfs setup.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/rockchip-iommu.c | 26 +++---
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index b33cdd5aad81..d25c2486ca07 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -1054,40 +1054,28 @@ static void rk_iommu_domain_free(struct iommu_domain 
*domain)
kfree(rk_domain);
 }
 
-static int rk_iommu_add_device(struct device *dev)
+static struct iommu_device *rk_iommu_probe_device(struct device *dev)
 {
-   struct iommu_group *group;
-   struct rk_iommu *iommu;
struct rk_iommudata *data;
+   struct rk_iommu *iommu;
 
data = dev->archdata.iommu;
if (!data)
-   return -ENODEV;
+   return ERR_PTR(-ENODEV);
 
iommu = rk_iommu_from_dev(dev);
 
-   group = iommu_group_get_for_dev(dev);
-   if (IS_ERR(group))
-   return PTR_ERR(group);
-   iommu_group_put(group);
-
-   iommu_device_link(>iommu, dev);
data->link = device_link_add(dev, iommu->dev,
 DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME);
 
-   return 0;
+   return >iommu;
 }
 
-static void rk_iommu_remove_device(struct device *dev)
+static void rk_iommu_release_device(struct device *dev)
 {
-   struct rk_iommu *iommu;
struct rk_iommudata *data = dev->archdata.iommu;
 
-   iommu = rk_iommu_from_dev(dev);
-
device_link_del(data->link);
-   iommu_device_unlink(>iommu, dev);
-   iommu_group_remove_device(dev);
 }
 
 static struct iommu_group *rk_iommu_device_group(struct device *dev)
@@ -1126,8 +1114,8 @@ static const struct iommu_ops rk_iommu_ops = {
.detach_dev = rk_iommu_detach_device,
.map = rk_iommu_map,
.unmap = rk_iommu_unmap,
-   .add_device = rk_iommu_add_device,
-   .remove_device = rk_iommu_remove_device,
+   .probe_device = rk_iommu_probe_device,
+   .release_device = rk_iommu_release_device,
.iova_to_phys = rk_iommu_iova_to_phys,
.device_group = rk_iommu_device_group,
.pgsize_bitmap = RK_IOMMU_PGSIZE_BITMAP,
-- 
2.17.1
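
The same conversion shape repeats in the other driver patches of this
series (tegra, renesas, msm, exynos, qcom), so a condensed, hedged
sketch of the pattern is shown once here for a hypothetical
single-instance driver; every foo_* name is illustrative and not taken
from a real driver:

#include <linux/err.h>
#include <linux/iommu.h>

/* Hypothetical single-instance driver state. */
static struct foo_iommu {
	struct iommu_device iommu;	/* registered with iommu_device_register() */
} foo_instance;

static struct iommu_device *foo_iommu_probe_device(struct device *dev)
{
	/* Only handle masters that were verified in .of_xlate(). */
	if (!dev_iommu_fwspec_get(dev))
		return ERR_PTR(-ENODEV);

	/*
	 * No iommu_group_get_for_dev() or iommu_device_link() calls any
	 * more: returning the iommu_device is enough, the core now does
	 * the group and sysfs setup itself.
	 */
	return &foo_instance.iommu;
}

static void foo_iommu_release_device(struct device *dev)
{
	/* Undo only driver-private per-device state here; the core removes
	 * the device from its group and unlinks it from sysfs. */
}

static const struct iommu_ops foo_iommu_ops = {
	/* ...domain_alloc/attach_dev/map/unmap etc. elided... */
	.probe_device	= foo_iommu_probe_device,
	.release_device	= foo_iommu_release_device,
	.device_group	= generic_device_group,
};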



[PATCH 20/29] gpu/drm: remove the powerpc hack in drm_legacy_sg_alloc

2020-04-14 Thread Christoph Hellwig
The non-cached vmalloc mapping was initially added as a hack for the
first-gen amigaone platform (6xx/book32s), which isn't fully supported
upstream and used the legacy radeon driver together with non-coherent
DMA. However, this only ever worked reliably for DRI.

Remove the hack as it is the last user of __vmalloc passing a page
protection flag other than PAGE_KERNEL and didn't do anything for
other platforms with non-coherent DMA.

Signed-off-by: Christoph Hellwig 
Acked-by: Daniel Vetter 
Acked-by: Peter Zijlstra (Intel) 
---
 drivers/gpu/drm/drm_scatter.c | 11 +--
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c
index ca520028b2cb..f4e6184d1877 100644
--- a/drivers/gpu/drm/drm_scatter.c
+++ b/drivers/gpu/drm/drm_scatter.c
@@ -43,15 +43,6 @@
 
 #define DEBUG_SCATTER 0
 
-static inline void *drm_vmalloc_dma(unsigned long size)
-{
-#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
-   return __vmalloc(size, GFP_KERNEL, pgprot_noncached_wc(PAGE_KERNEL));
-#else
-   return vmalloc_32(size);
-#endif
-}
-
 static void drm_sg_cleanup(struct drm_sg_mem * entry)
 {
struct page *page;
@@ -126,7 +117,7 @@ int drm_legacy_sg_alloc(struct drm_device *dev, void *data,
return -ENOMEM;
}
 
-   entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT);
+   entry->virtual = vmalloc_32(pages << PAGE_SHIFT);
if (!entry->virtual) {
kfree(entry->busaddr);
kfree(entry->pagelist);
-- 
2.25.1
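
For reference, a minimal sketch of the allocation path that remains
once the hack is gone; the example_* helpers are illustrative and not
part of the patch:

#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *example_alloc_sg_buffer(unsigned long nr_pages)
{
	/* The call drm_legacy_sg_alloc() now makes unconditionally: a
	 * virtually contiguous, cacheable (PAGE_KERNEL) buffer backed by
	 * pages addressable with 32 bits. */
	return vmalloc_32(nr_pages << PAGE_SHIFT);
}

static void example_free_sg_buffer(void *buf)
{
	vfree(buf);
}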



[PATCH v2 26/33] iommu/tegra: Convert to probe/release_device() call-backs

2020-04-14 Thread Joerg Roedel
From: Joerg Roedel 

Convert the Tegra IOMMU drivers to use the probe_device() and
release_device() call-backs of iommu_ops, so that the iommu core code
does the group and sysfs setup.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/tegra-gart.c | 24 ++--
 drivers/iommu/tegra-smmu.c | 31 ---
 2 files changed, 14 insertions(+), 41 deletions(-)

diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
index db6559e8336f..5fbdff6ff41a 100644
--- a/drivers/iommu/tegra-gart.c
+++ b/drivers/iommu/tegra-gart.c
@@ -243,28 +243,16 @@ static bool gart_iommu_capable(enum iommu_cap cap)
return false;
 }
 
-static int gart_iommu_add_device(struct device *dev)
+static struct iommu_device *gart_iommu_probe_device(struct device *dev)
 {
-   struct iommu_group *group;
-
if (!dev_iommu_fwspec_get(dev))
-   return -ENODEV;
-
-   group = iommu_group_get_for_dev(dev);
-   if (IS_ERR(group))
-   return PTR_ERR(group);
-
-   iommu_group_put(group);
+   return ERR_PTR(-ENODEV);
 
-   iommu_device_link(_handle->iommu, dev);
-
-   return 0;
+   return _handle->iommu;
 }
 
-static void gart_iommu_remove_device(struct device *dev)
+static void gart_iommu_release_device(struct device *dev)
 {
-   iommu_group_remove_device(dev);
-   iommu_device_unlink(_handle->iommu, dev);
 }
 
 static int gart_iommu_of_xlate(struct device *dev,
@@ -290,8 +278,8 @@ static const struct iommu_ops gart_iommu_ops = {
.domain_free= gart_iommu_domain_free,
.attach_dev = gart_iommu_attach_dev,
.detach_dev = gart_iommu_detach_dev,
-   .add_device = gart_iommu_add_device,
-   .remove_device  = gart_iommu_remove_device,
+   .probe_device   = gart_iommu_probe_device,
+   .release_device = gart_iommu_release_device,
.device_group   = generic_device_group,
.map= gart_iommu_map,
.unmap  = gart_iommu_unmap,
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index 63a147b623e6..7426b7666e2b 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -757,11 +757,10 @@ static int tegra_smmu_configure(struct tegra_smmu *smmu, 
struct device *dev,
return 0;
 }
 
-static int tegra_smmu_add_device(struct device *dev)
+static struct iommu_device *tegra_smmu_probe_device(struct device *dev)
 {
struct device_node *np = dev->of_node;
struct tegra_smmu *smmu = NULL;
-   struct iommu_group *group;
struct of_phandle_args args;
unsigned int index = 0;
int err;
@@ -774,7 +773,7 @@ static int tegra_smmu_add_device(struct device *dev)
of_node_put(args.np);
 
if (err < 0)
-   return err;
+   return ERR_PTR(err);
 
/*
 * Only a single IOMMU master interface is currently
@@ -783,8 +782,6 @@ static int tegra_smmu_add_device(struct device *dev)
 */
dev->archdata.iommu = smmu;
 
-   iommu_device_link(>iommu, dev);
-
break;
}
 
@@ -793,26 +790,14 @@ static int tegra_smmu_add_device(struct device *dev)
}
 
if (!smmu)
-   return -ENODEV;
-
-   group = iommu_group_get_for_dev(dev);
-   if (IS_ERR(group))
-   return PTR_ERR(group);
-
-   iommu_group_put(group);
+   return ERR_PTR(-ENODEV);
 
-   return 0;
+   return >iommu;
 }
 
-static void tegra_smmu_remove_device(struct device *dev)
+static void tegra_smmu_release_device(struct device *dev)
 {
-   struct tegra_smmu *smmu = dev->archdata.iommu;
-
-   if (smmu)
-   iommu_device_unlink(>iommu, dev);
-
dev->archdata.iommu = NULL;
-   iommu_group_remove_device(dev);
 }
 
 static const struct tegra_smmu_group_soc *
@@ -895,8 +880,8 @@ static const struct iommu_ops tegra_smmu_ops = {
.domain_free = tegra_smmu_domain_free,
.attach_dev = tegra_smmu_attach_dev,
.detach_dev = tegra_smmu_detach_dev,
-   .add_device = tegra_smmu_add_device,
-   .remove_device = tegra_smmu_remove_device,
+   .probe_device = tegra_smmu_probe_device,
+   .release_device = tegra_smmu_release_device,
.device_group = tegra_smmu_device_group,
.map = tegra_smmu_map,
.unmap = tegra_smmu_unmap,
@@ -1015,7 +1000,7 @@ struct tegra_smmu *tegra_smmu_probe(struct device *dev,
 * value. However the IOMMU registration process will attempt to add
 * all devices to the IOMMU when bus_set_iommu() is called. In order
 * not to rely on global variables to track the IOMMU instance, we
-* set it here so that it can be looked up from the .add_device()
+* set it here so that it can be looked up from the 

[PATCH v2 27/33] iommu/renesas: Convert to probe/release_device() call-backs

2020-04-14 Thread Joerg Roedel
From: Joerg Roedel 

Convert the Renesas IOMMU driver to use the probe_device() and
release_device() call-backs of iommu_ops, so that the iommu core code
does the group and sysfs setup.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/ipmmu-vmsa.c | 60 +-
 1 file changed, 20 insertions(+), 40 deletions(-)

diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index 310cf09feea3..fb7e702dee23 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -805,24 +805,8 @@ static int ipmmu_of_xlate(struct device *dev,
 static int ipmmu_init_arm_mapping(struct device *dev)
 {
struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
-   struct iommu_group *group;
int ret;
 
-   /* Create a device group and add the device to it. */
-   group = iommu_group_alloc();
-   if (IS_ERR(group)) {
-   dev_err(dev, "Failed to allocate IOMMU group\n");
-   return PTR_ERR(group);
-   }
-
-   ret = iommu_group_add_device(group, dev);
-   iommu_group_put(group);
-
-   if (ret < 0) {
-   dev_err(dev, "Failed to add device to IPMMU group\n");
-   return ret;
-   }
-
/*
 * Create the ARM mapping, used by the ARM DMA mapping core to allocate
 * VAs. This will allocate a corresponding IOMMU domain.
@@ -856,48 +840,39 @@ static int ipmmu_init_arm_mapping(struct device *dev)
return 0;
 
 error:
-   iommu_group_remove_device(dev);
if (mmu->mapping)
arm_iommu_release_mapping(mmu->mapping);
 
return ret;
 }
 
-static int ipmmu_add_device(struct device *dev)
+static struct iommu_device *ipmmu_probe_device(struct device *dev)
 {
struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
-   struct iommu_group *group;
-   int ret;
 
/*
 * Only let through devices that have been verified in xlate()
 */
if (!mmu)
-   return -ENODEV;
+   return ERR_PTR(-ENODEV);
 
-   if (IS_ENABLED(CONFIG_ARM) && !IS_ENABLED(CONFIG_IOMMU_DMA)) {
-   ret = ipmmu_init_arm_mapping(dev);
-   if (ret)
-   return ret;
-   } else {
-   group = iommu_group_get_for_dev(dev);
-   if (IS_ERR(group))
-   return PTR_ERR(group);
+   return >iommu;
+}
 
-   iommu_group_put(group);
-   }
+static void ipmmu_probe_finalize(struct device *dev)
+{
+   int ret = 0;
 
-   iommu_device_link(>iommu, dev);
-   return 0;
+   if (IS_ENABLED(CONFIG_ARM) && !IS_ENABLED(CONFIG_IOMMU_DMA))
+   ret = ipmmu_init_arm_mapping(dev);
+
+   if (ret)
+   dev_err(dev, "Can't create IOMMU mapping - DMA-OPS will not 
work\n");
 }
 
-static void ipmmu_remove_device(struct device *dev)
+static void ipmmu_release_device(struct device *dev)
 {
-   struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
-
-   iommu_device_unlink(>iommu, dev);
arm_iommu_detach_device(dev);
-   iommu_group_remove_device(dev);
 }
 
 static struct iommu_group *ipmmu_find_group(struct device *dev)
@@ -925,9 +900,14 @@ static const struct iommu_ops ipmmu_ops = {
.flush_iotlb_all = ipmmu_flush_iotlb_all,
.iotlb_sync = ipmmu_iotlb_sync,
.iova_to_phys = ipmmu_iova_to_phys,
-   .add_device = ipmmu_add_device,
-   .remove_device = ipmmu_remove_device,
+   .probe_device = ipmmu_probe_device,
+   .release_device = ipmmu_release_device,
+   .probe_finalize = ipmmu_probe_finalize,
+#if defined(CONFIG_ARM) && !defined(CONFIG_IOMMU_DMA)
+   .device_group = generic_device_group,
+#else
.device_group = ipmmu_find_group,
+#endif
.pgsize_bitmap = SZ_1G | SZ_2M | SZ_4K,
.of_xlate = ipmmu_of_xlate,
 };
-- 
2.17.1
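
Besides the usual conversion, this patch also relies on the new
.probe_finalize() call-back. A thin, hedged sketch of its role, with a
hypothetical foo_ prefix and the real work replaced by a debug message:

#include <linux/device.h>
#include <linux/iommu.h>
#include <linux/kernel.h>

static void foo_iommu_probe_finalize(struct device *dev)
{
	/*
	 * Called by the core only after probe_device() has succeeded and
	 * the group and default domain exist, so setup that depends on
	 * them (in ipmmu-vmsa: the legacy ARM DMA mapping) is safe here.
	 */
	if (IS_ENABLED(CONFIG_ARM) && !IS_ENABLED(CONFIG_IOMMU_DMA))
		dev_dbg(dev, "legacy ARM IOMMU mapping would be set up here\n");
}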



[PATCH v2 21/33] iommu/msm: Convert to probe/release_device() call-backs

2020-04-14 Thread Joerg Roedel
From: Joerg Roedel 

Convert the MSM IOMMU driver to use the probe_device() and
release_device() call-backs of iommu_ops, so that the iommu core code
does the group and sysfs setup.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/msm_iommu.c | 34 +++---
 1 file changed, 7 insertions(+), 27 deletions(-)

diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 94a6df1bddd6..10cd4db0710a 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -388,43 +388,23 @@ static struct msm_iommu_dev *find_iommu_for_dev(struct 
device *dev)
return ret;
 }
 
-static int msm_iommu_add_device(struct device *dev)
+static struct iommu_device *msm_iommu_probe_device(struct device *dev)
 {
struct msm_iommu_dev *iommu;
-   struct iommu_group *group;
unsigned long flags;
 
spin_lock_irqsave(_iommu_lock, flags);
iommu = find_iommu_for_dev(dev);
spin_unlock_irqrestore(_iommu_lock, flags);
 
-   if (iommu)
-   iommu_device_link(>iommu, dev);
-   else
-   return -ENODEV;
-
-   group = iommu_group_get_for_dev(dev);
-   if (IS_ERR(group))
-   return PTR_ERR(group);
-
-   iommu_group_put(group);
+   if (!iommu)
+   return ERR_PTR(-ENODEV);
 
-   return 0;
+   return >iommu;
 }
 
-static void msm_iommu_remove_device(struct device *dev)
+static void msm_iommu_release_device(struct device *dev)
 {
-   struct msm_iommu_dev *iommu;
-   unsigned long flags;
-
-   spin_lock_irqsave(_iommu_lock, flags);
-   iommu = find_iommu_for_dev(dev);
-   spin_unlock_irqrestore(_iommu_lock, flags);
-
-   if (iommu)
-   iommu_device_unlink(>iommu, dev);
-
-   iommu_group_remove_device(dev);
 }
 
 static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device 
*dev)
@@ -708,8 +688,8 @@ static struct iommu_ops msm_iommu_ops = {
 */
.iotlb_sync = NULL,
.iova_to_phys = msm_iommu_iova_to_phys,
-   .add_device = msm_iommu_add_device,
-   .remove_device = msm_iommu_remove_device,
+   .probe_device = msm_iommu_probe_device,
+   .release_device = msm_iommu_release_device,
.device_group = generic_device_group,
.pgsize_bitmap = MSM_IOMMU_PGSIZES,
.of_xlate = qcom_iommu_of_xlate,
-- 
2.17.1



[PATCH v2 04/33] iommu/vt-d: Wire up iommu_ops->def_domain_type

2020-04-14 Thread Joerg Roedel
From: Joerg Roedel 

The Intel VT-d driver already has a matching function to determine the
default domain type for a device. Wire it up in intel_iommu_ops.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/intel-iommu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index ef0a5246700e..b9f905a55dda 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -6209,6 +6209,7 @@ const struct iommu_ops intel_iommu_ops = {
.dev_enable_feat= intel_iommu_dev_enable_feat,
.dev_disable_feat   = intel_iommu_dev_disable_feat,
.is_attach_deferred = intel_iommu_is_attach_deferred,
+   .def_domain_type= device_def_domain_type,
.pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
 };
 
-- 
2.17.1



[PATCH v2 12/33] iommu: Move iommu_group_create_direct_mappings() out of iommu_group_add_device()

2020-04-14 Thread Joerg Roedel
From: Joerg Roedel 

After the previous changes, the iommu group may not have a default
domain when iommu_group_add_device() is called. With no default domain,
iommu_group_create_direct_mappings() will do nothing and no direct
mappings will be created.

Rename iommu_group_create_direct_mappings() to
iommu_create_device_direct_mappings() to better reflect that the
function creates direct mappings only for one device and not for all
devices in the group. Then move the call to the places where a default
domain actually exists.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/iommu.c | 35 ++-
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 7de0e29db333..834a45da0ed0 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -89,6 +89,8 @@ static int __iommu_attach_group(struct iommu_domain *domain,
struct iommu_group *group);
 static void __iommu_detach_group(struct iommu_domain *domain,
 struct iommu_group *group);
+static int iommu_create_device_direct_mappings(struct iommu_group *group,
+  struct device *dev);
 
 #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store)  \
 struct iommu_group_attribute iommu_group_attr_##_name =\
@@ -243,6 +245,8 @@ static int __iommu_probe_device_helper(struct device *dev)
if (group->default_domain)
ret = __iommu_attach_device(group->default_domain, dev);
 
+   iommu_create_device_direct_mappings(group, dev);
+
iommu_group_put(group);
 
if (ret)
@@ -263,6 +267,7 @@ static int __iommu_probe_device_helper(struct device *dev)
 int iommu_probe_device(struct device *dev)
 {
const struct iommu_ops *ops = dev->bus->iommu_ops;
+   struct iommu_group *group;
int ret;
 
WARN_ON(dev->iommu_group);
@@ -285,6 +290,10 @@ int iommu_probe_device(struct device *dev)
if (ret)
goto err_module_put;
 
+   group = iommu_group_get(dev);
+   iommu_create_device_direct_mappings(group, dev);
+   iommu_group_put(group);
+
if (ops->probe_finalize)
ops->probe_finalize(dev);
 
@@ -736,8 +745,8 @@ int iommu_group_set_name(struct iommu_group *group, const 
char *name)
 }
 EXPORT_SYMBOL_GPL(iommu_group_set_name);
 
-static int iommu_group_create_direct_mappings(struct iommu_group *group,
- struct device *dev)
+static int iommu_create_device_direct_mappings(struct iommu_group *group,
+  struct device *dev)
 {
struct iommu_domain *domain = group->default_domain;
struct iommu_resv_region *entry;
@@ -841,8 +850,6 @@ int iommu_group_add_device(struct iommu_group *group, 
struct device *dev)
 
dev->iommu_group = group;
 
-   iommu_group_create_direct_mappings(group, dev);
-
mutex_lock(>mutex);
list_add_tail(>list, >devices);
if (group->domain)
@@ -1736,6 +1743,7 @@ static void probe_alloc_default_domain(struct bus_type 
*bus,
gtype.type = iommu_def_domain_type;
 
iommu_group_alloc_default_domain(bus, group, gtype.type);
+
 }
 
 static int iommu_group_do_dma_attach(struct device *dev, void *data)
@@ -1760,6 +1768,21 @@ static int __iommu_group_dma_attach(struct iommu_group 
*group)
  iommu_group_do_dma_attach);
 }
 
+static int iommu_do_create_direct_mappings(struct device *dev, void *data)
+{
+   struct iommu_group *group = data;
+
+   iommu_create_device_direct_mappings(group, dev);
+
+   return 0;
+}
+
+static int iommu_group_create_direct_mappings(struct iommu_group *group)
+{
+   return __iommu_group_for_each_dev(group, group,
+ iommu_do_create_direct_mappings);
+}
+
 static int bus_iommu_probe(struct bus_type *bus)
 {
const struct iommu_ops *ops = bus->iommu_ops;
@@ -1792,6 +1815,8 @@ static int bus_iommu_probe(struct bus_type *bus)
continue;
}
 
+   iommu_group_create_direct_mappings(group);
+
ret = __iommu_group_dma_attach(group);
 
mutex_unlock(>mutex);
@@ -2632,7 +2657,7 @@ request_default_domain_for_dev(struct device *dev, 
unsigned long type)
iommu_domain_free(group->default_domain);
group->default_domain = domain;
 
-   iommu_group_create_direct_mappings(group, dev);
+   iommu_create_device_direct_mappings(group, dev);
 
dev_info(dev, "Using iommu %s mapping\n",
 type == IOMMU_DOMAIN_DMA ? "dma" : "direct");
-- 
2.17.1
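
For readers new to the terminology: a "direct mapping" is an identity
mapping of a device's IOMMU_RESV_DIRECT reserved regions into the
group's default domain, which is why the call has to move to places
where such a domain already exists. A rough sketch of the idea only,
not the core implementation (which additionally checks page-size
alignment and already-mapped ranges); the example_ name is illustrative:

#include <linux/iommu.h>
#include <linux/list.h>

static void example_create_direct_mappings(struct iommu_domain *def_domain,
					   struct device *dev)
{
	struct iommu_resv_region *entry;
	LIST_HEAD(mappings);

	iommu_get_resv_regions(dev, &mappings);

	list_for_each_entry(entry, &mappings, list) {
		if (entry->type != IOMMU_RESV_DIRECT)
			continue;
		/* Identity-map the reserved range: IOVA == physical address. */
		if (iommu_map(def_domain, entry->start, entry->start,
			      entry->length, entry->prot))
			break;	/* error handling elided in this sketch */
	}

	iommu_put_resv_regions(dev, &mappings);
}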



[PATCH v2 14/33] iommu/amd: Remove dev_data->passthrough

2020-04-14 Thread Joerg Roedel
From: Joerg Roedel 

Make use of generic IOMMU infrastructure to gather the same information
carried in dev_data->passthrough and remove the struct member.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/amd_iommu.c   | 10 +-
 drivers/iommu/amd_iommu_types.h |  1 -
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 3e0d27f7622e..0b4b4faa876d 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2047,8 +2047,8 @@ static int pdev_iommuv2_enable(struct pci_dev *pdev)
 static int attach_device(struct device *dev,
 struct protection_domain *domain)
 {
-   struct pci_dev *pdev;
struct iommu_dev_data *dev_data;
+   struct pci_dev *pdev;
unsigned long flags;
int ret;
 
@@ -2067,8 +2067,10 @@ static int attach_device(struct device *dev,
 
pdev = to_pci_dev(dev);
if (domain->flags & PD_IOMMUV2_MASK) {
+   struct iommu_domain *def_domain = iommu_get_dma_domain(dev);
+
ret = -EINVAL;
-   if (!dev_data->passthrough)
+   if (def_domain->type != IOMMU_DOMAIN_IDENTITY)
goto out;
 
if (dev_data->iommu_v2) {
@@ -2189,9 +2191,7 @@ static int amd_iommu_add_device(struct device *dev)
 
/* Domains are initialized for this device - have a look what we ended 
up with */
domain = iommu_get_domain_for_dev(dev);
-   if (domain->type == IOMMU_DOMAIN_IDENTITY)
-   dev_data->passthrough = true;
-   else if (domain->type == IOMMU_DOMAIN_DMA)
+   if (domain->type == IOMMU_DOMAIN_DMA)
iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, 0);
 
 out:
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index ca8c4522045b..d0d7b6a0c3d8 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -640,7 +640,6 @@ struct iommu_dev_data {
struct pci_dev *pdev;
u16 devid;/* PCI Device ID */
bool iommu_v2;/* Device can make use of IOMMUv2 */
-   bool passthrough; /* Device is identity mapped */
struct {
bool enabled;
int qdep;
-- 
2.17.1
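
The replacement pattern is small enough to show on its own: rather than
caching a per-device passthrough flag, query the device's default
domain and check its type. A minimal sketch using the same interface as
the hunk above; the example_ name is illustrative:

#include <linux/iommu.h>

static bool example_dev_is_identity_mapped(struct device *dev)
{
	struct iommu_domain *def_domain = iommu_get_dma_domain(dev);

	/* True when the device sits in a passthrough (identity) default
	 * domain. */
	return def_domain->type == IOMMU_DOMAIN_IDENTITY;
}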



[PATCH v2 31/33] iommu/exynos: Convert to probe/release_device() call-backs

2020-04-14 Thread Joerg Roedel
From: Joerg Roedel 

Convert the Exynos IOMMU driver to use the probe_device() and
release_device() call-backs of iommu_ops, so that the iommu core code
does the group and sysfs setup.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/exynos-iommu.c | 26 ++
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index 09cdd163560a..60c8a56e4a3f 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -1235,19 +1235,13 @@ static phys_addr_t exynos_iommu_iova_to_phys(struct 
iommu_domain *iommu_domain,
return phys;
 }
 
-static int exynos_iommu_add_device(struct device *dev)
+static struct iommu_device *exynos_iommu_probe_device(struct device *dev)
 {
struct exynos_iommu_owner *owner = dev->archdata.iommu;
struct sysmmu_drvdata *data;
-   struct iommu_group *group;
 
if (!has_sysmmu(dev))
-   return -ENODEV;
-
-   group = iommu_group_get_for_dev(dev);
-
-   if (IS_ERR(group))
-   return PTR_ERR(group);
+   return ERR_PTR(-ENODEV);
 
list_for_each_entry(data, >controllers, owner_node) {
/*
@@ -1259,17 +1253,15 @@ static int exynos_iommu_add_device(struct device *dev)
 DL_FLAG_STATELESS |
 DL_FLAG_PM_RUNTIME);
}
-   iommu_group_put(group);
 
/* There is always at least one entry, see exynos_iommu_of_xlate() */
data = list_first_entry(>controllers,
struct sysmmu_drvdata, owner_node);
-   iommu_device_link(>iommu, dev);
 
-   return 0;
+   return >iommu;
 }
 
-static void exynos_iommu_remove_device(struct device *dev)
+static void exynos_iommu_release_device(struct device *dev)
 {
struct exynos_iommu_owner *owner = dev->archdata.iommu;
struct sysmmu_drvdata *data;
@@ -1287,15 +1279,9 @@ static void exynos_iommu_remove_device(struct device 
*dev)
iommu_group_put(group);
}
}
-   iommu_group_remove_device(dev);
 
list_for_each_entry(data, >controllers, owner_node)
device_link_del(data->link);
-
-   /* There is always at least one entry, see exynos_iommu_of_xlate() */
-   data = list_first_entry(>controllers,
-   struct sysmmu_drvdata, owner_node);
-   iommu_device_unlink(>iommu, dev);
 }
 
 static int exynos_iommu_of_xlate(struct device *dev,
@@ -1341,8 +1327,8 @@ static const struct iommu_ops exynos_iommu_ops = {
.unmap = exynos_iommu_unmap,
.iova_to_phys = exynos_iommu_iova_to_phys,
.device_group = generic_device_group,
-   .add_device = exynos_iommu_add_device,
-   .remove_device = exynos_iommu_remove_device,
+   .probe_device = exynos_iommu_probe_device,
+   .release_device = exynos_iommu_release_device,
.pgsize_bitmap = SECT_SIZE | LPAGE_SIZE | SPAGE_SIZE,
.of_xlate = exynos_iommu_of_xlate,
 };
-- 
2.17.1



[PATCH v2 24/33] iommu/qcom: Convert to probe/release_device() call-backs

2020-04-14 Thread Joerg Roedel
From: Joerg Roedel 

Convert the QCOM IOMMU driver to use the probe_device() and
release_device() call-backs of iommu_ops, so that the iommu core code
does the group and sysfs setup.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/qcom_iommu.c | 24 +++-
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c
index 0e2a96467767..054e476ebd49 100644
--- a/drivers/iommu/qcom_iommu.c
+++ b/drivers/iommu/qcom_iommu.c
@@ -524,14 +524,13 @@ static bool qcom_iommu_capable(enum iommu_cap cap)
}
 }
 
-static int qcom_iommu_add_device(struct device *dev)
+static struct iommu_device *qcom_iommu_probe_device(struct device *dev)
 {
struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
-   struct iommu_group *group;
struct device_link *link;
 
if (!qcom_iommu)
-   return -ENODEV;
+   return ERR_PTR(-ENODEV);
 
/*
 * Establish the link between iommu and master, so that the
@@ -542,28 +541,19 @@ static int qcom_iommu_add_device(struct device *dev)
if (!link) {
dev_err(qcom_iommu->dev, "Unable to create device link between 
%s and %s\n",
dev_name(qcom_iommu->dev), dev_name(dev));
-   return -ENODEV;
+   return ERR_PTR(-ENODEV);
}
 
-   group = iommu_group_get_for_dev(dev);
-   if (IS_ERR(group))
-   return PTR_ERR(group);
-
-   iommu_group_put(group);
-   iommu_device_link(_iommu->iommu, dev);
-
-   return 0;
+   return _iommu->iommu;
 }
 
-static void qcom_iommu_remove_device(struct device *dev)
+static void qcom_iommu_release_device(struct device *dev)
 {
struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
 
if (!qcom_iommu)
return;
 
-   iommu_device_unlink(_iommu->iommu, dev);
-   iommu_group_remove_device(dev);
iommu_fwspec_free(dev);
 }
 
@@ -619,8 +609,8 @@ static const struct iommu_ops qcom_iommu_ops = {
.flush_iotlb_all = qcom_iommu_flush_iotlb_all,
.iotlb_sync = qcom_iommu_iotlb_sync,
.iova_to_phys   = qcom_iommu_iova_to_phys,
-   .add_device = qcom_iommu_add_device,
-   .remove_device  = qcom_iommu_remove_device,
+   .probe_device   = qcom_iommu_probe_device,
+   .release_device = qcom_iommu_release_device,
.device_group   = generic_device_group,
.of_xlate   = qcom_iommu_of_xlate,
.pgsize_bitmap  = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
-- 
2.17.1



[PATCH v2 30/33] iommu/exynos: Use first SYSMMU in controllers list for IOMMU core

2020-04-14 Thread Joerg Roedel
From: Joerg Roedel 

On Exynos platforms there can be more than one SYSMMU (IOMMU) for one
DMA master device. Since the IOMMU core code expects only one hardware
IOMMU per device, use the first SYSMMU in the controllers list.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/exynos-iommu.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index 186ff5cc975c..09cdd163560a 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -1261,6 +1261,11 @@ static int exynos_iommu_add_device(struct device *dev)
}
iommu_group_put(group);
 
+   /* There is always at least one entry, see exynos_iommu_of_xlate() */
+   data = list_first_entry(>controllers,
+   struct sysmmu_drvdata, owner_node);
+   iommu_device_link(>iommu, dev);
+
return 0;
 }
 
@@ -1286,6 +1291,11 @@ static void exynos_iommu_remove_device(struct device 
*dev)
 
list_for_each_entry(data, >controllers, owner_node)
device_link_del(data->link);
+
+   /* There is always at least one entry, see exynos_iommu_of_xlate() */
+   data = list_first_entry(>controllers,
+   struct sysmmu_drvdata, owner_node);
+   iommu_device_unlink(>iommu, dev);
 }
 
 static int exynos_iommu_of_xlate(struct device *dev,
-- 
2.17.1



[PATCH v2 02/33] iommu: Add def_domain_type() callback in iommu_ops

2020-04-14 Thread Joerg Roedel
From: Sai Praneeth Prakhya 

Some devices are required to use a specific type (identity or dma)
of default domain when they are used with a vendor iommu. When the
system level default domain type is different from it, the vendor
iommu driver has to request a new default domain with
iommu_request_dma_domain_for_dev() and iommu_request_dm_for_dev()
in the add_device() callback. Unfortunately, these two helpers only
work when the group hasn't been assigned to any other devices,
hence, some vendor iommu drivers have to use a private domain if
they fail to request a new default one.

This adds a def_domain_type() callback to iommu_ops, so that any
special default domain requirement of a device can be taken into
account by the generic iommu layer.

Signed-off-by: Sai Praneeth Prakhya 
Signed-off-by: Lu Baolu 
[ jroe...@suse.de: Added iommu_get_def_domain_type() function and use
   it to allocate the default domain ]
Co-developed-by: Joerg Roedel 
Signed-off-by: Joerg Roedel 
---
 drivers/iommu/iommu.c | 20 +---
 include/linux/iommu.h |  6 ++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index bfe011760ed1..5877abd9b693 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1361,21 +1361,35 @@ struct iommu_group *fsl_mc_device_group(struct device 
*dev)
 }
 EXPORT_SYMBOL_GPL(fsl_mc_device_group);
 
+static int iommu_get_def_domain_type(struct device *dev)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+   unsigned int type = 0;
+
+   if (ops->def_domain_type)
+   type = ops->def_domain_type(dev);
+
+   return (type == 0) ? iommu_def_domain_type : type;
+}
+
 static int iommu_alloc_default_domain(struct device *dev,
  struct iommu_group *group)
 {
struct iommu_domain *dom;
+   unsigned int type;
 
if (group->default_domain)
return 0;
 
-   dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type);
-   if (!dom && iommu_def_domain_type != IOMMU_DOMAIN_DMA) {
+   type = iommu_get_def_domain_type(dev);
+
+   dom = __iommu_domain_alloc(dev->bus, type);
+   if (!dom && type != IOMMU_DOMAIN_DMA) {
dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA);
if (dom) {
dev_warn(dev,
 "failed to allocate default IOMMU domain of 
type %u; falling back to IOMMU_DOMAIN_DMA",
-iommu_def_domain_type);
+type);
}
}
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 7ef8b0bda695..1f027b07e499 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -248,6 +248,10 @@ struct iommu_iotlb_gather {
  * @cache_invalidate: invalidate translation caches
  * @sva_bind_gpasid: bind guest pasid and mm
  * @sva_unbind_gpasid: unbind guest pasid and mm
+ * @def_domain_type: device default domain type, return value:
+ * - IOMMU_DOMAIN_IDENTITY: must use an identity domain
+ * - IOMMU_DOMAIN_DMA: must use a dma domain
+ * - 0: use the default setting
  * @pgsize_bitmap: bitmap of all possible supported page sizes
  * @owner: Driver module providing these ops
  */
@@ -318,6 +322,8 @@ struct iommu_ops {
 
int (*sva_unbind_gpasid)(struct device *dev, int pasid);
 
+   int (*def_domain_type)(struct device *dev);
+
unsigned long pgsize_bitmap;
struct module *owner;
 };
-- 
2.17.1
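
To illustrate the driver side of this contract, a hedged sketch of a
def_domain_type() implementation for a hypothetical foo driver follows;
the policy it encodes (identity-map display-class PCI devices) is
invented purely for the example:

#include <linux/iommu.h>
#include <linux/pci.h>

static int foo_iommu_def_domain_type(struct device *dev)
{
	/* Example policy: GPUs must use an identity (passthrough) domain. */
	if (dev_is_pci(dev) &&
	    (to_pci_dev(dev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
		return IOMMU_DOMAIN_IDENTITY;

	return 0;	/* no preference: use the system-wide default */
}

/* Wired up like any other member of the driver's iommu_ops:
 *	.def_domain_type = foo_iommu_def_domain_type,
 */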



[PATCH 17/29] mm: remove unmap_vmap_area

2020-04-14 Thread Christoph Hellwig
This function has just a single caller; open code it there.

Signed-off-by: Christoph Hellwig 
Acked-by: Peter Zijlstra (Intel) 
---
 mm/vmalloc.c | 10 +-
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b0c7cdc8701a..258220b203f1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1247,14 +1247,6 @@ int unregister_vmap_purge_notifier(struct notifier_block 
*nb)
 }
 EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
 
-/*
- * Clear the pagetable entries of a given vmap_area
- */
-static void unmap_vmap_area(struct vmap_area *va)
-{
-   unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
-}
-
 /*
  * lazy_max_pages is the maximum amount of virtual address space we gather up
  * before attempting to purge with a TLB flush.
@@ -1416,7 +1408,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
 static void free_unmap_vmap_area(struct vmap_area *va)
 {
flush_cache_vunmap(va->va_start, va->va_end);
-   unmap_vmap_area(va);
+   unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
 
-- 
2.25.1
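
For completeness, the open-coded sequence in free_unmap_vmap_area()
generalizes to any kernel VA range mapped with the noflush helpers. A
minimal sketch, essentially what unmap_kernel_range() does, assuming
the caller tracks addr and size itself; the example_ name is
illustrative:

#include <linux/vmalloc.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

static void example_unmap_kernel_range(unsigned long addr, unsigned long size)
{
	flush_cache_vunmap(addr, addr + size);		/* virtually tagged caches */
	unmap_kernel_range_noflush(addr, size);		/* clear the page-table entries */
	flush_tlb_kernel_range(addr, addr + size);	/* drop stale TLB entries */
}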



  1   2   >