[PATCH v3 1/2] iommu/sva: Tighten SVA bind API with explicit flags

2021-04-16 Thread Jacob Pan
The void *drvdata parameter isn't really used in the iommu_sva_bind_device()
API; the current IDXD code "borrows" drvdata to carry a VT-d private flag
for supervisor SVA usage.

Supervisor/Privileged mode request is a generic feature. It should be
promoted from the VT-d vendor driver to the generic code.

This patch replaces the void *drvdata parameter with an unsigned int flags
parameter and adjusts callers accordingly.
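
For illustration, a caller that previously requested supervisor SVA via
the VT-d private flag now passes the generic flag instead (sketch based
on the idxd hunks below):

    /* Before: VT-d private flag smuggled through drvdata */
    int flags = SVM_FLAG_SUPERVISOR_MODE;
    sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags);

    /* After: explicit, generic flags parameter */
    sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL,
                                IOMMU_SVA_BIND_SUPERVISOR);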

Link: https://lore.kernel.org/linux-iommu/YFhiMLR35WWMW%2FHu@myrica/
Suggested-by: Jean-Philippe Brucker 
Signed-off-by: Jacob Pan 
---
 drivers/dma/idxd/cdev.c   |  2 +-
 drivers/dma/idxd/init.c   |  7 ++-
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   |  5 -
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  4 ++--
 drivers/iommu/intel/svm.c | 14 --
 drivers/iommu/iommu.c |  9 ++---
 drivers/misc/uacce/uacce.c|  2 +-
 include/linux/intel-iommu.h   |  2 +-
 include/linux/intel-svm.h | 17 ++---
 include/linux/iommu.h | 19 ---
 10 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index 0db9b82ed8cf..21ec82bc47b6 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -103,7 +103,7 @@ static int idxd_cdev_open(struct inode *inode, struct file 
*filp)
filp->private_data = ctx;
 
if (device_pasid_enabled(idxd)) {
-   sva = iommu_sva_bind_device(dev, current->mm, NULL);
+   sva = iommu_sva_bind_device(dev, current->mm, 0);
if (IS_ERR(sva)) {
rc = PTR_ERR(sva);
dev_err(dev, "pasid allocation failed: %d\n", rc);
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 085a0c3b62c6..7b2290b19787 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -14,7 +14,6 @@
 #include 
 #include 
 #include 
-#include <linux/intel-svm.h>
 #include 
 #include 
 #include 
@@ -300,13 +299,11 @@ static struct idxd_device *idxd_alloc(struct pci_dev 
*pdev)
 
 static int idxd_enable_system_pasid(struct idxd_device *idxd)
 {
-   int flags;
unsigned int pasid;
struct iommu_sva *sva;
 
-   flags = SVM_FLAG_SUPERVISOR_MODE;
-
-   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags);
+   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL,
+   IOMMU_SVA_BIND_SUPERVISOR);
if (IS_ERR(sva)) {
dev_warn(&idxd->pdev->dev,
 "iommu sva bind failed: %ld\n", PTR_ERR(sva));
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index bb251cab61f3..145ceb2fc5da 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -354,12 +354,15 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct 
*mm)
 }
 
 struct iommu_sva *
-arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
+arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, unsigned int flags)
 {
struct iommu_sva *handle;
struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 
+   if (flags)
+   return ERR_PTR(-EINVAL);
+
if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1)
return ERR_PTR(-EINVAL);
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index f985817c967a..b971d4dcf090 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -711,7 +711,7 @@ bool arm_smmu_master_sva_enabled(struct arm_smmu_master 
*master);
 int arm_smmu_master_enable_sva(struct arm_smmu_master *master);
 int arm_smmu_master_disable_sva(struct arm_smmu_master *master);
 struct iommu_sva *arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm,
-   void *drvdata);
+   unsigned int flags);
 void arm_smmu_sva_unbind(struct iommu_sva *handle);
 u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle);
 void arm_smmu_sva_notifier_synchronize(void);
@@ -742,7 +742,7 @@ static inline int arm_smmu_master_disable_sva(struct 
arm_smmu_master *master)
 }
 
 static inline struct iommu_sva *
-arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
+arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, unsigned int flags)
 {
return ERR_PTR(-ENODEV);
 }
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 574a7e657a9a..d4840821f7b5 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -486,12 +486,9 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags,
}

[PATCH v3 0/2] Simplify and restrict IOMMU SVA APIs

2021-04-16 Thread Jacob Pan
A couple of small changes to simplify and restrict the SVA APIs. The
motivation is to make PASID allocation palatable for cgroup accounting.
The misc cgroup controller is merged for v5.13 and can be extended to
cover IOASIDs as another scalar resource.
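
As a rough sketch of that direction (hypothetical only -- there is no
MISC_CG_RES_IOASID resource type today), charging an IOASID against the
misc controller could look like:

    #include <linux/misc_cgroup.h>

    /* Hypothetical resource type; sketch only, not part of this series */
    static int ioasid_cg_charge(void)
    {
            struct misc_cg *cg = get_current_misc_cg();
            int ret = misc_cg_try_charge(MISC_CG_RES_IOASID, cg, 1);

            if (ret)
                    put_misc_cg(cg);
            /* on success, keep the cg reference until uncharge at free */
            return ret;
    }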

I have not tested on ARM platforms due to hardware availability. I would
appreciate it if someone could help with testing the uacce-based SVA usages.

Thanks,

Jacob

ChangeLog:
V3
- stop passing mm to the sva_bind IOMMU op; no need to take an mm refcount
  in the common SVA code
- deleted the flags variable in the idxd driver

V2
- retained mm argument in iommu_sva_alloc_pasid()
- keep generic supervisor flag separated from vt-d's SRE
- move flag declaration out of CONFIG_IOMMU_API



Jacob Pan (2):
  iommu/sva: Tighten SVA bind API with explicit flags
  iommu/sva: Remove mm parameter from SVA bind API

 drivers/dma/idxd/cdev.c   |  2 +-
 drivers/dma/idxd/init.c   |  7 ++
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   | 12 ++
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  5 ++--
 drivers/iommu/intel/svm.c | 19 ---
 drivers/iommu/iommu-sva-lib.c | 11 +
 drivers/iommu/iommu-sva-lib.h |  2 +-
 drivers/iommu/iommu.c | 13 +--
 drivers/misc/uacce/uacce.c|  2 +-
 include/linux/intel-iommu.h   |  3 +--
 include/linux/intel-svm.h | 17 ++
 include/linux/iommu.h | 23 ++-
 12 files changed, 56 insertions(+), 60 deletions(-)


base-commit: e49d033bddf5b565044e2abe4241353959bc9120
-- 
2.25.1



[PATCH v3 2/2] iommu/sva: Remove mm parameter from SVA bind API

2021-04-16 Thread Jacob Pan
The mm parameter in iommu_sva_bind_device() is intended for a privileged
process to perform bind() on behalf of other processes. This use case has
yet to materialize, let alone the potential security implications of
adding kernel hooks without explicit user consent.
In addition, there is agreement that IOASID allocation shall be subject to
cgroup limits. Limiting IOASID allocation during SVA bind to the current
task keeps it in line with the misc cgroup proposal.
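
After this change the call sites look like the following (sketch based on
the hunks below); the mm now always comes from current inside the IOMMU
layer:

    /* Bind the calling process's own address space */
    sva = iommu_sva_bind_device(dev, 0);

    /* Supervisor bind no longer passes an explicit NULL mm */
    sva = iommu_sva_bind_device(&idxd->pdev->dev,
                                IOMMU_SVA_BIND_SUPERVISOR);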

Link: https://lore.kernel.org/linux-iommu/20210303160205.151d114e@jacob-builder/
Link: https://lore.kernel.org/linux-iommu/YFhiMLR35WWMW%2FHu@myrica/
Signed-off-by: Jacob Pan 
---
 drivers/dma/idxd/cdev.c |  2 +-
 drivers/dma/idxd/init.c |  2 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c |  9 +
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  3 +--
 drivers/iommu/intel/svm.c   | 17 +++--
 drivers/iommu/iommu-sva-lib.c   | 11 ++-
 drivers/iommu/iommu-sva-lib.h   |  2 +-
 drivers/iommu/iommu.c   | 14 +-
 drivers/misc/uacce/uacce.c  |  2 +-
 include/linux/intel-iommu.h |  3 +--
 include/linux/iommu.h   |  8 +++-
 11 files changed, 36 insertions(+), 37 deletions(-)

diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index 21ec82bc47b6..8c3347c8930c 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -103,7 +103,7 @@ static int idxd_cdev_open(struct inode *inode, struct file 
*filp)
filp->private_data = ctx;
 
if (device_pasid_enabled(idxd)) {
-   sva = iommu_sva_bind_device(dev, current->mm, 0);
+   sva = iommu_sva_bind_device(dev, 0);
if (IS_ERR(sva)) {
rc = PTR_ERR(sva);
dev_err(dev, "pasid allocation failed: %d\n", rc);
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 7b2290b19787..f64a19b5e513 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -302,7 +302,7 @@ static int idxd_enable_system_pasid(struct idxd_device 
*idxd)
unsigned int pasid;
struct iommu_sva *sva;
 
-   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL,
+   sva = iommu_sva_bind_device(&idxd->pdev->dev,
IOMMU_SVA_BIND_SUPERVISOR);
if (IS_ERR(sva)) {
dev_warn(&idxd->pdev->dev,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index 145ceb2fc5da..0c3014e64c77 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -305,10 +305,11 @@ static void arm_smmu_mmu_notifier_put(struct 
arm_smmu_mmu_notifier *smmu_mn)
 }
 
 static struct iommu_sva *
-__arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm)
+__arm_smmu_sva_bind(struct device *dev)
 {
int ret;
struct arm_smmu_bond *bond;
+   struct mm_struct *mm = current->mm;
struct arm_smmu_master *master = dev_iommu_priv_get(dev);
struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
@@ -329,7 +330,7 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct 
*mm)
return ERR_PTR(-ENOMEM);
 
/* Allocate a PASID for this mm if necessary */
-   ret = iommu_sva_alloc_pasid(mm, 1, (1U << master->ssid_bits) - 1);
+   ret = iommu_sva_alloc_pasid(1, (1U << master->ssid_bits) - 1);
if (ret)
goto err_free_bond;
 
@@ -354,7 +355,7 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct 
*mm)
 }
 
 struct iommu_sva *
-arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, unsigned int flags)
+arm_smmu_sva_bind(struct device *dev, unsigned int flags)
 {
struct iommu_sva *handle;
struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
@@ -367,7 +368,7 @@ arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, 
unsigned int flags)
return ERR_PTR(-EINVAL);
 
mutex_lock(&sva_lock);
-   handle = __arm_smmu_sva_bind(dev, mm);
+   handle = __arm_smmu_sva_bind(dev);
mutex_unlock(&sva_lock);
return handle;
 }
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index b971d4dcf090..306fa59a9db5 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -710,8 +710,7 @@ bool arm_smmu_master_sva_supported(struct arm_smmu_master 
*master);
 bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master);
 int arm_smmu_master_enable_sva(struct arm_smmu_master *master);
 int arm_smmu_master_disable_sva(struct arm_smmu_master *master);
-struct iommu_sva *arm

Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-04-16 Thread Jacob Pan
Hi Alex,

On Fri, 16 Apr 2021 09:45:47 -0600, Alex Williamson
 wrote:

> On Fri, 16 Apr 2021 06:12:58 -0700
> Jacob Pan  wrote:
> 
> > Hi Jason,
> > 
> > On Thu, 15 Apr 2021 20:07:32 -0300, Jason Gunthorpe 
> > wrote: 
> > > On Thu, Apr 15, 2021 at 03:11:19PM +0200, Auger Eric wrote:
> > > > Hi Jason,
> > > > 
> > > > On 4/1/21 6:03 PM, Jason Gunthorpe wrote:  
> > > > > On Thu, Apr 01, 2021 at 02:08:17PM +, Liu, Yi L wrote:
> > > > >   
> > > > >> DMA page faults are delivered to root-complex via page request
> > > > >> message and it is per-device according to PCIe spec. Page request
> > > > >> handling flow is:
> > > > >>
> > > > >> 1) iommu driver receives a page request from device
> > > > >> 2) iommu driver parses the page request message. Get the
> > > > >> RID,PASID, faulted page and requested permissions etc.
> > > > >> 3) iommu driver triggers fault handler registered by device
> > > > >> driver with iommu_report_device_fault()  
> > > > > 
> > > > > This seems confused.
> > > > > 
> > > > > The PASID should define how to handle the page fault, not the
> > > > > driver. 
> > > > 
> > > > In my series I don't use PASID at all. I am just enabling nested
> > > > stage and the guest uses a single context. I don't allocate any
> > > > user PASID at any point.
> > > > 
> > > > When there is a fault at physical level (a stage 1 fault that
> > > > concerns the guest), this latter needs to be reported and injected
> > > > into the guest. The vfio pci driver registers a fault handler to
> > > > the iommu layer and in that fault handler it fills a circ buffer
> > > > and triggers an eventfd that is listened to by the VFIO-PCI QEMU
> > > > device. This latter retrieves the fault from the mmapped circ
> > > > buffer, it knows which vIOMMU it is attached to, and passes the
> > > > fault to the vIOMMU. Then the vIOMMU triggers an IRQ in the guest.
> > > > 
> > > > We are reusing the existing concepts from VFIO, region, IRQ to do
> > > > that.
> > > > 
> > > > For that use case, would you also use /dev/ioasid?  
> > > 
> > > /dev/ioasid could do all the things you described vfio-pci as doing,
> > > it can even do them the same way you just described.
> > > 
> > > Stated another way, do you plan to duplicate all of this code someday
> > > for vfio-cxl? What about for vfio-platform? ARM SMMU can be hooked to
> > > platform devices, right?
> > > 
> > > I feel what you guys are struggling with is some choice in the iommu
> > > kernel APIs that cause the events to be delivered to the pci_device
> > > owner, not the PASID owner.
> > > 
> > > That feels solvable.
> > > 
> > Perhaps more of a philosophical question for you and Alex. There is no
> > doubt that the direction you guided for /dev/ioasid is a much cleaner
> > one, especially after VDPA emerged as another IOMMU backed framework.  
> 
> I think this statement answers all your remaining questions ;)
> 
> > The question is what do we do with the nested translation features that
> > have been targeting the existing VFIO-IOMMU for the last three years?
> > That predates VDPA. Shall we put a stop marker *after* nested support
> > and say no more extensions for VFIO-IOMMU, new features must be built
> > on this new interface?
> >
> > If we were to close a checkout line for some unforeseen reasons, should
> > we honor the customers already in line for a long time?
> > 
> > This is not a tactic or excuse for not working on the new /dev/ioasid
> > interface. In fact, I believe we can benefit from the lessons learned
> > while completing the existing. This will give confidence to the new
> > interface. Thoughts?  
> 
> I understand a big part of Jason's argument is that we shouldn't be in
> the habit of creating duplicate interfaces; we should create one
> well-designed interface to share among multiple subsystems.  As new users
> have emerged, our solution needs to change to a common one rather than
> a VFIO specific one.  The IOMMU uAPI provides an abstraction, but at
> the wrong level, requiring userspace interfaces for each subsystem.
> 
> Luckily the IOMMU uAPI is not really exposed as an actual uAPI, but
> that change

Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-04-16 Thread Jacob Pan
Hi Jason,

On Thu, 15 Apr 2021 20:07:32 -0300, Jason Gunthorpe  wrote:

> On Thu, Apr 15, 2021 at 03:11:19PM +0200, Auger Eric wrote:
> > Hi Jason,
> > 
> > On 4/1/21 6:03 PM, Jason Gunthorpe wrote:  
> > > On Thu, Apr 01, 2021 at 02:08:17PM +, Liu, Yi L wrote:
> > >   
> > >> DMA page faults are delivered to root-complex via page request
> > >> message and it is per-device according to PCIe spec. Page request
> > >> handling flow is:
> > >>
> > >> 1) iommu driver receives a page request from device
> > >> 2) iommu driver parses the page request message. Get the RID,PASID,
> > >> faulted page and requested permissions etc.
> > >> 3) iommu driver triggers fault handler registered by device driver
> > >> with iommu_report_device_fault()  
> > > 
> > > This seems confused.
> > > 
> > > The PASID should define how to handle the page fault, not the driver.
> > >  
> > 
> > In my series I don't use PASID at all. I am just enabling nested stage
> > and the guest uses a single context. I don't allocate any user PASID at
> > any point.
> > 
> > When there is a fault at physical level (a stage 1 fault that concerns
> > the guest), this latter needs to be reported and injected into the
> > guest. The vfio pci driver registers a fault handler to the iommu layer
> > and in that fault handler it fills a circ buffer and triggers an eventfd
> > that is listened to by the VFIO-PCI QEMU device. This latter retrieves
> > the fault from the mmapped circ buffer, it knows which vIOMMU it is
> > attached to, and passes the fault to the vIOMMU.
> > Then the vIOMMU triggers an IRQ in the guest.
> > 
> > We are reusing the existing concepts from VFIO, region, IRQ to do that.
> > 
> > For that use case, would you also use /dev/ioasid?  
> 
> /dev/ioasid could do all the things you described vfio-pci as doing,
> it can even do them the same way you just described.
> 
> Stated another way, do you plan to duplicate all of this code someday
> for vfio-cxl? What about for vfio-platform? ARM SMMU can be hooked to
> platform devices, right?
> 
> I feel what you guys are struggling with is some choice in the iommu
> kernel APIs that cause the events to be delivered to the pci_device
> owner, not the PASID owner.
> 
> That feels solvable.
> 
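
As context for the per-device delivery you mention: the current reporting
API registers one handler per struct device, so every fault on that device
lands with the device owner regardless of PASID. A minimal sketch, assuming
the existing iommu_register_device_fault_handler()/iommu_page_response()
interfaces (my_iopf_handler is a made-up name):

    /* Handler runs for every fault on this device, whatever the PASID */
    static int my_iopf_handler(struct iommu_fault *fault, void *data)
    {
            struct device *dev = data;
            struct iommu_page_response resp;

            if (fault->type != IOMMU_FAULT_PAGE_REQ)
                    return -EOPNOTSUPP;

            /* ... resolve fault->prm.addr for fault->prm.pasid ... */

            resp = (struct iommu_page_response) {
                    .version = IOMMU_PAGE_RESP_VERSION_1,
                    .flags   = IOMMU_PAGE_RESP_PASID_VALID,
                    .pasid   = fault->prm.pasid,
                    .grpid   = fault->prm.grpid,
                    .code    = IOMMU_PAGE_RESP_SUCCESS,
            };
            return iommu_page_response(dev, &resp);
    }

    /* Registration is per struct device, i.e. per device owner */
    ret = iommu_register_device_fault_handler(dev, my_iopf_handler, dev);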
Perhaps more of a philosophical question for you and Alex. There is no
doubt that the direction you guided for /dev/ioasid is a much cleaner one,
especially after VDPA emerged as another IOMMU backed framework.

The question is what do we do with the nested translation features that have
been targeting the existing VFIO-IOMMU for the last three years? That
predates VDPA. Shall we put a stop marker *after* nested support and say no
more extensions for VFIO-IOMMU, new features must be built on this new
interface?

If we were to close a checkout line for some unforeseen reasons, should we
honor the customers already in line for a long time?

This is not a tactic or excuse for not working on the new /dev/ioasid
interface. In fact, I believe we can benefit from the lessons learned while
completing the existing. This will give confidence to the new
interface. Thoughts?

> Jason


Thanks,

Jacob


Re: [PATCH v2 2/2] iommu/sva: Remove mm parameter from SVA bind API

2021-04-15 Thread Jacob Pan
Hi Christoph,

On Thu, 15 Apr 2021 07:44:59 +0100, Christoph Hellwig 
wrote:

> >   *
> >   * Returns 0 on success and < 0 on error.
> > @@ -28,6 +28,9 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm,
> > ioasid_t min, ioasid_t max) int ret = 0;
> > ioasid_t pasid;
> >  
> > +   if (mm != current->mm)
> > +   return -EINVAL;
> > +  
> 
> Why not remove the parameter entirely?
It was removed in my v1, but I thought it would be cleaner to treat
iommu_sva_alloc_pasid() as a leaf function of iommu_sva_bind_device(). Then
we don't have to do get_task_mm() every time. But to your point below, it
is better to let the low-level driver handle it.
> 
> > @@ -2989,8 +2990,11 @@ iommu_sva_bind_device(struct device *dev, struct
> > mm_struct *mm, unsigned int fla return ERR_PTR(-ENODEV);
> >  
> > /* Supervisor SVA does not need the current mm */
> > -   if ((flags & IOMMU_SVA_BIND_SUPERVISOR) && mm)
> > -   return ERR_PTR(-EINVAL);
> > +   if (!(flags & IOMMU_SVA_BIND_SUPERVISOR)) {
> > +   mm = get_task_mm(current);
> > +   if (!mm)
> > +   return ERR_PTR(-EINVAL);
> > +   }  
> 
> I don't see why we need the reference.  I think we should just stop
> passing the mm to ->sva_bind and let the low-level driver deal with
> any reference to current->mm where needed.
The mm users reference is just a precaution, in case the low-level driver
uses a kthread etc.
I agree it is cleaner to just remove mm here and let the low-level driver
deal with it.
Let me give it a spin.

Thanks,

Jacob


Re: [PATCH v2 1/2] iommu/sva: Tighten SVA bind API with explicit flags

2021-04-15 Thread Jacob Pan
Hi Christoph,

Thanks for the review.

On Thu, 15 Apr 2021 07:40:33 +0100, Christoph Hellwig 
wrote:

> On Wed, Apr 14, 2021 at 08:27:56AM -0700, Jacob Pan wrote:
> >  static int idxd_enable_system_pasid(struct idxd_device *idxd)
> >  {
> > -   int flags;
> > +   unsigned int flags;
> > unsigned int pasid;
> > struct iommu_sva *sva;
> >  
> > -   flags = SVM_FLAG_SUPERVISOR_MODE;
> > +   flags = IOMMU_SVA_BIND_SUPERVISOR;
> >  
> > -   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags);
> > +   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, flags);  
> 
> Please also remove the now pointless flags variable.
> 
Good catch.

> > +iommu_sva_bind_device(struct device *dev, struct mm_struct *mm,
> > unsigned int flags)  
> 
> Pleae avoid the pointless overly long line.
> 
> > -#define SVM_FLAG_GUEST_PASID   (1<<3)
> > +#define SVM_FLAG_GUEST_PASID   (1<<2)  
> 
> This flag is entirely unused, please just remove it in a prep patch
> rather than renumbering it.
> 
You are right. The flag was set and intended to be used by the guest IO
page request patches by Baolu.

As you might be aware, we are restructuring the guest SVA uAPI according to
Jason's proposal, can we wait until we have a clear solution? We may
refactor lots of code.

> >  static inline struct iommu_sva *
> > -iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void
> > *drvdata) +iommu_sva_bind_device(struct device *dev, struct mm_struct
> > *mm, unsigned int flags)  
> 
> Same overy long line here.
This is temporary as the mm parameter will be removed in the next patch.

Thanks,

Jacob


[PATCH v2 2/2] iommu/sva: Remove mm parameter from SVA bind API

2021-04-14 Thread Jacob Pan
The mm parameter in iommu_sva_bind_device() is intended for a privileged
process to perform bind() on behalf of other processes. This use case has
yet to materialize, let alone the potential security implications of
adding kernel hooks without explicit user consent.
In addition, there is agreement that IOASID allocation shall be subject to
cgroup limits. Limiting IOASID allocation during SVA bind to the current
task keeps it in line with the misc cgroup proposal.

Link: https://lore.kernel.org/linux-iommu/20210303160205.151d114e@jacob-builder/
Link: https://lore.kernel.org/linux-iommu/YFhiMLR35WWMW%2FHu@myrica/
Signed-off-by: Jacob Pan 
---
 drivers/dma/idxd/cdev.c   |  2 +-
 drivers/dma/idxd/init.c   |  2 +-
 drivers/iommu/iommu-sva-lib.c | 11 +++
 drivers/iommu/iommu.c | 20 +---
 drivers/misc/uacce/uacce.c|  2 +-
 include/linux/iommu.h |  3 +--
 6 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index 21ec82bc47b6..8c3347c8930c 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -103,7 +103,7 @@ static int idxd_cdev_open(struct inode *inode, struct file 
*filp)
filp->private_data = ctx;
 
if (device_pasid_enabled(idxd)) {
-   sva = iommu_sva_bind_device(dev, current->mm, 0);
+   sva = iommu_sva_bind_device(dev, 0);
if (IS_ERR(sva)) {
rc = PTR_ERR(sva);
dev_err(dev, "pasid allocation failed: %d\n", rc);
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 82a0985ad6dc..a92fa625f3b5 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -305,7 +305,7 @@ static int idxd_enable_system_pasid(struct idxd_device 
*idxd)
 
flags = IOMMU_SVA_BIND_SUPERVISOR;
 
-   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, flags);
+   sva = iommu_sva_bind_device(&idxd->pdev->dev, flags);
if (IS_ERR(sva)) {
dev_warn(&idxd->pdev->dev,
 "iommu sva bind failed: %ld\n", PTR_ERR(sva));
diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c
index bd41405d34e9..6e3d1a010d47 100644
--- a/drivers/iommu/iommu-sva-lib.c
+++ b/drivers/iommu/iommu-sva-lib.c
@@ -12,13 +12,13 @@ static DECLARE_IOASID_SET(iommu_sva_pasid);
 
 /**
  * iommu_sva_alloc_pasid - Allocate a PASID for the mm
- * @mm: the mm
  * @min: minimum PASID value (inclusive)
  * @max: maximum PASID value (inclusive)
  *
- * Try to allocate a PASID for this mm, or take a reference to the existing one
- * provided it fits within the [@min, @max] range. On success the PASID is
- * available in mm->pasid, and must be released with iommu_sva_free_pasid().
+ * Try to allocate a PASID for the current mm, or take a reference to the
+ * existing one provided it fits within the [@min, @max] range. On success
+ * the PASID is available in the current mm->pasid, and must be released with
+ * iommu_sva_free_pasid().
  * @min must be greater than 0, because 0 indicates an unused mm->pasid.
  *
  * Returns 0 on success and < 0 on error.
@@ -28,6 +28,9 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, 
ioasid_t max)
int ret = 0;
ioasid_t pasid;
 
+   if (mm != current->mm)
+   return -EINVAL;
+
if (min == INVALID_IOASID || max == INVALID_IOASID ||
min == 0 || max < min)
return -EINVAL;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index eefa541d8674..5bbc35c395a6 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 static struct kset *iommu_group_kset;
@@ -2959,15 +2960,14 @@ int iommu_aux_get_pasid(struct iommu_domain *domain, 
struct device *dev)
 EXPORT_SYMBOL_GPL(iommu_aux_get_pasid);
 
 /**
- * iommu_sva_bind_device() - Bind a process address space to a device
+ * iommu_sva_bind_device() - Bind the current process address space to a device
  * @dev: the device
- * @mm: the mm to bind, caller must hold a reference to it
  * @flags: options for the bind operation defined as IOMMU_SVA_BIND_*
  *
  * Create a bond between device and address space, allowing the device to 
access
  * the mm using the returned PASID. If a bond already exists between @device 
and
- * @mm, it is returned and an additional reference is taken. Caller must call
- * iommu_sva_unbind_device() to release each reference.
+ * the current mm, it is returned and an additional reference is taken. Caller
+ * must call iommu_sva_unbind_device() to release each reference.
  *
  * iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) must be called first, to
  * initialize the required SVA features.
@@ -2975,9 +2975,10 @@ EXPORT_SYMBOL_GPL(iommu_aux_get_pasid);
  * On error, returns an ERR_PTR value.
  */
 struct iommu_sva *
-iommu_sva_bind_device

[PATCH v2 1/2] iommu/sva: Tighten SVA bind API with explicit flags

2021-04-14 Thread Jacob Pan
The void *drvdata parameter isn't really used in the iommu_sva_bind_device()
API; the current IDXD code "borrows" drvdata to carry a VT-d private flag
for supervisor SVA usage.

Supervisor/Privileged mode request is a generic feature. It should be
promoted from the VT-d vendor driver to the generic code.

This patch replaces the void *drvdata parameter with an unsigned int flags
parameter and adjusts callers accordingly.

Link: https://lore.kernel.org/linux-iommu/YFhiMLR35WWMW%2FHu@myrica/
Suggested-by: Jean-Philippe Brucker 
Signed-off-by: Jacob Pan 
---
 drivers/dma/idxd/cdev.c   |  2 +-
 drivers/dma/idxd/init.c   |  7 +++
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   |  5 -
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  4 ++--
 drivers/iommu/intel/svm.c | 14 --
 drivers/iommu/iommu.c |  9 ++---
 drivers/misc/uacce/uacce.c|  2 +-
 include/linux/intel-iommu.h   |  2 +-
 include/linux/intel-svm.h | 17 ++---
 include/linux/iommu.h | 19 ---
 10 files changed, 40 insertions(+), 41 deletions(-)

diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index 0db9b82ed8cf..21ec82bc47b6 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -103,7 +103,7 @@ static int idxd_cdev_open(struct inode *inode, struct file 
*filp)
filp->private_data = ctx;
 
if (device_pasid_enabled(idxd)) {
-   sva = iommu_sva_bind_device(dev, current->mm, NULL);
+   sva = iommu_sva_bind_device(dev, current->mm, 0);
if (IS_ERR(sva)) {
rc = PTR_ERR(sva);
dev_err(dev, "pasid allocation failed: %d\n", rc);
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 085a0c3b62c6..82a0985ad6dc 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -14,7 +14,6 @@
 #include 
 #include 
 #include 
-#include <linux/intel-svm.h>
 #include 
 #include 
 #include 
@@ -300,13 +299,13 @@ static struct idxd_device *idxd_alloc(struct pci_dev 
*pdev)
 
 static int idxd_enable_system_pasid(struct idxd_device *idxd)
 {
-   int flags;
+   unsigned int flags;
unsigned int pasid;
struct iommu_sva *sva;
 
-   flags = SVM_FLAG_SUPERVISOR_MODE;
+   flags = IOMMU_SVA_BIND_SUPERVISOR;
 
-   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags);
+   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, flags);
if (IS_ERR(sva)) {
dev_warn(&idxd->pdev->dev,
 "iommu sva bind failed: %ld\n", PTR_ERR(sva));
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index bb251cab61f3..145ceb2fc5da 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -354,12 +354,15 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct 
*mm)
 }
 
 struct iommu_sva *
-arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
+arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, unsigned int flags)
 {
struct iommu_sva *handle;
struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 
+   if (flags)
+   return ERR_PTR(-EINVAL);
+
if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1)
return ERR_PTR(-EINVAL);
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index f985817c967a..b971d4dcf090 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -711,7 +711,7 @@ bool arm_smmu_master_sva_enabled(struct arm_smmu_master 
*master);
 int arm_smmu_master_enable_sva(struct arm_smmu_master *master);
 int arm_smmu_master_disable_sva(struct arm_smmu_master *master);
 struct iommu_sva *arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm,
-   void *drvdata);
+   unsigned int flags);
 void arm_smmu_sva_unbind(struct iommu_sva *handle);
 u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle);
 void arm_smmu_sva_notifier_synchronize(void);
@@ -742,7 +742,7 @@ static inline int arm_smmu_master_disable_sva(struct 
arm_smmu_master *master)
 }
 
 static inline struct iommu_sva *
-arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
+arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, unsigned int flags)
 {
return ERR_PTR(-ENODEV);
 }
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 574a7e657a9a..d4840821f7b5 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -486,12 +486,9 @@ intel_svm_bind_mm(struct device *dev, unsigned i

[PATCH v2 0/2] Simplify and restrict IOMMU SVA APIs

2021-04-14 Thread Jacob Pan
A couple of small changes to simplify and restrict the SVA APIs. The
motivation is to make PASID allocation palatable for cgroup accounting.
The misc cgroup controller is merged for v5.13 and can be extended to
cover IOASIDs as another scalar resource.

I have not tested on ARM platforms due to hardware availability. I would
appreciate it if someone could help with the testing on ARM.

Thanks,

Jacob

ChangeLog:
V2
- retained mm argument in iommu_sva_alloc_pasid()
- keep generic supervisor flag separated from vt-d's SRE
- move flag declaration out of CONFIG_IOMMU_API


Jacob Pan (2):
  iommu/sva: Tighten SVA bind API with explicit flags
  iommu/sva: Remove mm parameter from SVA bind API

 drivers/dma/idxd/cdev.c   |  2 +-
 drivers/dma/idxd/init.c   |  7 +++---
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   |  5 +++-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  4 ++--
 drivers/iommu/intel/svm.c | 14 ---
 drivers/iommu/iommu-sva-lib.c | 11 +
 drivers/iommu/iommu.c | 23 +--
 drivers/misc/uacce/uacce.c|  2 +-
 include/linux/intel-iommu.h   |  2 +-
 include/linux/intel-svm.h | 17 ++
 include/linux/iommu.h | 20 
 11 files changed, 57 insertions(+), 50 deletions(-)


base-commit: e49d033bddf5b565044e2abe4241353959bc9120
-- 
2.25.1



Re: [PATCH 2/2] iommu/sva: Remove mm parameter from SVA bind API

2021-04-13 Thread Jacob Pan
Hi Jean,

On Fri, 9 Apr 2021 11:03:05 -0700, Jacob Pan
 wrote:

> > problems:
> > 
> > * We don't have a use-case for binding the mm of a remote process (and
> >   it's supposedly difficult for device drivers to do it securely). So
> > OK, we remove the mm argument from iommu_sva_bind_device() and use the
> >   current mm. But the IOMMU driver isn't going to do
> > get_task_mm(current) every time it needs the mm being bound, it will
> > take it from iommu_sva_bind_device(). Likewise iommu_sva_alloc_pasid()
> > shouldn't need to bother with get_task_mm().
> > 
> > * cgroup accounting for IOASIDs needs to be on the current task.
> > Removing the mm parameter from iommu_sva_alloc_pasid() doesn't help
> > with that. Sure it indicates that iommu_sva_alloc_pasid() needs a
> > specific task context but that's only for cgroup purpose, and I'd
> > rather pass the cgroup down from iommu_sva_bind_device() anyway (but am
> > fine with keeping it within ioasid_alloc() for now). Plus it's an
> > internal helper, easy for us to check that the callers are doing the
> > right thing. 
> With the above split, we really just have one allocation function:
> ioasid_alloc(), so it can manage current cgroup accounting within. Would
> this work?
After a few attempts, I don't think the split can work better. I will
restore the mm parameter and add a warning if mm != current->mm.

Thanks,

Jacob


Re: [PATCH 1/2] iommu/sva: Tighten SVA bind API with explicit flags

2021-04-13 Thread Jacob Pan
Hi Baolu,
Thanks for the review.

On Fri, 9 Apr 2021 20:24:22 +0800, Lu Baolu 
wrote:

> Hi Jacob,
> 
> On 2021/4/9 1:08, Jacob Pan wrote:
> > The void* drvdata parameter isn't really used in iommu_sva_bind_device()
> > API, the current IDXD code "borrows" the drvdata for a VT-d private flag
> > for supervisor SVA usage.
> > 
> > Supervisor/Privileged mode request is a generic feature. It should be
> > promoted from the VT-d vendor driver to the generic code.
> > 
> > This patch replaces void* drvdata with a unsigned int flags parameter
> > and adjusts callers accordingly.
> > 
> > Link: https://lore.kernel.org/linux-iommu/YFhiMLR35WWMW%2FHu@myrica/
> > Suggested-by: Jean-Philippe Brucker 
> > Signed-off-by: Jacob Pan 
> > ---
> >   drivers/dma/idxd/cdev.c |  2 +-
> >   drivers/dma/idxd/init.c |  6 +++---
> >   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c |  2 +-
> >   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  4 ++--
> >   drivers/iommu/intel/Kconfig |  1 +
> >   drivers/iommu/intel/svm.c   | 18
> > ++ drivers/iommu/iommu.c   |  9
> > ++--- drivers/misc/uacce/uacce.c  |  2 +-
> >   include/linux/intel-iommu.h |  2 +-
> >   include/linux/intel-svm.h   | 17 ++---
> >   include/linux/iommu.h   | 19
> > --- 11 files changed, 40 insertions(+), 42 deletions(-)
> > 
> > diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
> > index 0db9b82..21ec82b 100644
> > --- a/drivers/dma/idxd/cdev.c
> > +++ b/drivers/dma/idxd/cdev.c
> > @@ -103,7 +103,7 @@ static int idxd_cdev_open(struct inode *inode,
> > struct file *filp) filp->private_data = ctx;
> >   
> > if (device_pasid_enabled(idxd)) {
> > -   sva = iommu_sva_bind_device(dev, current->mm, NULL);
> > +   sva = iommu_sva_bind_device(dev, current->mm, 0);
> > if (IS_ERR(sva)) {
> > rc = PTR_ERR(sva);
> > dev_err(dev, "pasid allocation failed: %d\n",
> > rc); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
> > index 085a0c3..cdc85f1 100644
> > --- a/drivers/dma/idxd/init.c
> > +++ b/drivers/dma/idxd/init.c
> > @@ -300,13 +300,13 @@ static struct idxd_device *idxd_alloc(struct
> > pci_dev *pdev) 
> >   static int idxd_enable_system_pasid(struct idxd_device *idxd)
> >   {
> > -   int flags;
> > +   unsigned int flags;
> > unsigned int pasid;
> > struct iommu_sva *sva;
> >   
> > -   flags = SVM_FLAG_SUPERVISOR_MODE;
> > +   flags = IOMMU_SVA_BIND_SUPERVISOR;  
> 
> With SVM_FLAG_SUPERVISOR_MODE removed, I guess
> 
> #include <linux/intel-svm.h>
> 
> in this file could be removed as well.
yes, good point.

> 
> >   
> > -   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags);
> > +   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, flags);
> > if (IS_ERR(sva)) {
> > dev_warn(&idxd->pdev->dev,
> >  "iommu sva bind failed: %ld\n",
> > PTR_ERR(sva)); diff --git
> > a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> > b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index
> > bb251ca..23e287e 100644 ---
> > a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++
> > b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -354,7 +354,7 @@
> > __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) } 
> >   struct iommu_sva *
> > -arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void
> > *drvdata) +arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm,
> > unsigned int flags) {
> > struct iommu_sva *handle;
> > struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> > b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index f985817..b971d4d
> > 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> > +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> > @@ -711,7 +711,7 @@ bool arm_smmu_master_sva_enabled(struct
> > arm_smmu_master *master); int arm_smmu_master_enable_sva(struct
> > arm_smmu_master *master); int arm_smmu_master_disable_sva(struct
> > arm_smmu_master *master); struct iommu_sva *arm_smmu_sva_bind(struct
> > device *dev, struct mm_struct *mm,
> > -   void *drvdata);
> > +   

Re: [PATCH 1/2] iommu/sva: Tighten SVA bind API with explicit flags

2021-04-09 Thread Jacob Pan
Hi Jean-Philippe,

On Fri, 9 Apr 2021 12:22:21 +0200, Jean-Philippe Brucker
 wrote:

> On Thu, Apr 08, 2021 at 10:08:55AM -0700, Jacob Pan wrote:
> > The void* drvdata parameter isn't really used in iommu_sva_bind_device()
> > API,  
> 
> Right, it used to be a cookie passed to the device driver in the exit_mm()
> callback, but that went away with edcc40d2ab5f ("iommu: Remove
> iommu_sva_ops::mm_exit()")
> 
> > the current IDXD code "borrows" the drvdata for a VT-d private flag
> > for supervisor SVA usage.
> > 
> > Supervisor/Privileged mode request is a generic feature. It should be
> > promoted from the VT-d vendor driver to the generic code.
> > 
> > This patch replaces void* drvdata with a unsigned int flags parameter
> > and adjusts callers accordingly.  
> 
> Thanks for cleaning this up. Making flags unsigned long seems more common
> (I suggested int without thinking). But it doesn't matter much, we won't
> get to 32 flags.
> 
I was just thinking unsigned int is 32 bits on both 32 and 64 bit machines.

> > 
> > Link: https://lore.kernel.org/linux-iommu/YFhiMLR35WWMW%2FHu@myrica/
> > Suggested-by: Jean-Philippe Brucker 
> > Signed-off-by: Jacob Pan 
> > ---
> >  drivers/dma/idxd/cdev.c |  2 +-
> >  drivers/dma/idxd/init.c |  6 +++---
> >  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c |  2 +-
> >  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  4 ++--
> >  drivers/iommu/intel/Kconfig |  1 +
> >  drivers/iommu/intel/svm.c   | 18 ++
> >  drivers/iommu/iommu.c   |  9 ++---
> >  drivers/misc/uacce/uacce.c  |  2 +-
> >  include/linux/intel-iommu.h |  2 +-
> >  include/linux/intel-svm.h   | 17 ++---
> >  include/linux/iommu.h   | 19
> > --- 11 files changed, 40 insertions(+), 42 deletions(-)
> > 
> > diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
> > index 0db9b82..21ec82b 100644
> > --- a/drivers/dma/idxd/cdev.c
> > +++ b/drivers/dma/idxd/cdev.c
> > @@ -103,7 +103,7 @@ static int idxd_cdev_open(struct inode *inode,
> > struct file *filp) filp->private_data = ctx;
> >  
> > if (device_pasid_enabled(idxd)) {
> > -   sva = iommu_sva_bind_device(dev, current->mm, NULL);
> > +   sva = iommu_sva_bind_device(dev, current->mm, 0);
> > if (IS_ERR(sva)) {
> > rc = PTR_ERR(sva);
> > dev_err(dev, "pasid allocation failed: %d\n",
> > rc); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
> > index 085a0c3..cdc85f1 100644
> > --- a/drivers/dma/idxd/init.c
> > +++ b/drivers/dma/idxd/init.c
> > @@ -300,13 +300,13 @@ static struct idxd_device *idxd_alloc(struct
> > pci_dev *pdev) 
> >  static int idxd_enable_system_pasid(struct idxd_device *idxd)
> >  {
> > -   int flags;
> > +   unsigned int flags;
> > unsigned int pasid;
> > struct iommu_sva *sva;
> >  
> > -   flags = SVM_FLAG_SUPERVISOR_MODE;
> > +   flags = IOMMU_SVA_BIND_SUPERVISOR;
> >  
> > -   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags);
> > +   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, flags);
> > if (IS_ERR(sva)) {
> > dev_warn(&idxd->pdev->dev,
> >  "iommu sva bind failed: %ld\n", PTR_ERR(sva));
> > diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> > b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index
> > bb251ca..23e287e 100644 ---
> > a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++
> > b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -354,7 +354,7 @@
> > __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) }
> >  
> >  struct iommu_sva *
> > -arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void
> > *drvdata) +arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm,
> > unsigned int flags)  
> 
> Could you add a check on flags:
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index
> bb251cab61f3..145ceb2fc5da 100644 ---
> a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++
> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -354,12 +354,15 @@
> __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) }
> 
>  struct iommu_sva *
> -arm_smmu_sva_bind(struct device *dev, struct mm_st

Re: [PATCH 2/2] iommu/sva: Remove mm parameter from SVA bind API

2021-04-09 Thread Jacob Pan
Hi Lu,

On Fri, 9 Apr 2021 20:45:22 +0800, Lu Baolu 
wrote:

> > -int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t
> > max) +int iommu_sva_alloc_pasid(ioasid_t min, ioasid_t max)
> >   {
> > int ret = 0;
> > ioasid_t pasid;
> > +   struct mm_struct *mm;
> >   
> > if (min == INVALID_IOASID || max == INVALID_IOASID ||
> > min == 0 || max < min)
> > return -EINVAL;
> >   
> > mutex_lock(&iommu_sva_lock);
> > +   mm = get_task_mm(current);  
> 
> How could we allocate a supervisor PASID through iommu_sva_alloc_pasid()
> if we always use current->mm here?
I don't think you can. But I guess the current callers of this function do
not need a supervisor PASID.
In my reply to Jean, I suggested we split this function into an mm->pasid
assignment and keep using ioasid_alloc() directly; then a supervisor PASID
is the caller's bind choice.

Thanks,

Jacob


Re: [PATCH 2/2] iommu/sva: Remove mm parameter from SVA bind API

2021-04-09 Thread Jacob Pan
Hi Jean-Philippe,

On Fri, 9 Apr 2021 12:11:47 +0200, Jean-Philippe Brucker
 wrote:

> On Thu, Apr 08, 2021 at 10:08:56AM -0700, Jacob Pan wrote:
> > diff --git a/drivers/iommu/iommu-sva-lib.c
> > b/drivers/iommu/iommu-sva-lib.c index bd41405..bd99f6b 100644
> > --- a/drivers/iommu/iommu-sva-lib.c
> > +++ b/drivers/iommu/iommu-sva-lib.c
> > @@ -12,27 +12,33 @@ static DECLARE_IOASID_SET(iommu_sva_pasid);
> >  
> >  /**
> >   * iommu_sva_alloc_pasid - Allocate a PASID for the mm
> > - * @mm: the mm
> >   * @min: minimum PASID value (inclusive)
> >   * @max: maximum PASID value (inclusive)
> >   *
> > - * Try to allocate a PASID for this mm, or take a reference to the
> > existing one
> > - * provided it fits within the [@min, @max] range. On success the
> > PASID is
> > - * available in mm->pasid, and must be released with
> > iommu_sva_free_pasid().
> > + * Try to allocate a PASID for the current mm, or take a reference to
> > the
> > + * existing one provided it fits within the [@min, @max] range. On
> > success
> > + * the PASID is available in the current mm->pasid, and must be
> > released with
> > + * iommu_sva_free_pasid().
> >   * @min must be greater than 0, because 0 indicates an unused
> > mm->pasid. *
> >   * Returns 0 on success and < 0 on error.
> >   */
> > -int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t
> > max) +int iommu_sva_alloc_pasid(ioasid_t min, ioasid_t max)
> >  {
> > int ret = 0;
> > ioasid_t pasid;
> > +   struct mm_struct *mm;
> >  
> > if (min == INVALID_IOASID || max == INVALID_IOASID ||
> > min == 0 || max < min)
> > return -EINVAL;
> >  
> > mutex_lock(&iommu_sva_lock);
> > +   mm = get_task_mm(current);
> > +   if (!mm) {
> > +   ret = -EINVAL;
> > +   goto out_unlock;
> > +   }  
> 
> I still think it would be more elegant to keep the choice of context in
> iommu_sva_bind_device() and pass it down to leaf functions such as
> iommu_sva_alloc_pasid(). The patch is trying to solve two separate

I would agree if iommu_sva_alloc_pasid() were a leaf function, but it is a
public function, e.g. called by the smmu code:
/* Allocate a PASID for this mm if necessary */
ret = iommu_sva_alloc_pasid(1, (1U << master->ssid_bits) - 1);
If we pass mm as a parameter, it gives callers the illusion that this mm
doesn't have to be current->mm.

Should we make it into a leaf function by splitting iommu_sva_alloc_pasid()
into two parts?
1. iommu_sva_assign_pasid() // a new leaf helper that does the mm->pasid
assignment
2. ioasid_alloc()

in iommu_sva_bind_device(), we do:
1. handle = driver ops->sva_bind(dev, mm, flags);
2. pasid = sva_get_pasid(handle);
3. iommu_sva_assign_pasid(mm, pasid)

In the vendor driver's sva_bind(), just use ioasid_alloc() directly with a
custom range, e.g. in arm-smmu-v3-sva.c:
- ret = iommu_sva_alloc_pasid(1, (1U << master->ssid_bits) - 1);
+ ret = ioasid_alloc(&iommu_sva_pasid, 1, (1U << master->ssid_bits) - 1);
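
To make the proposal concrete, a sketch of the new leaf helper
(iommu_sva_assign_pasid() does not exist today; the name and behavior are
only what I am suggesting above):

    static int iommu_sva_assign_pasid(struct mm_struct *mm, ioasid_t pasid)
    {
            if (mm != current->mm)
                    return -EINVAL;

            mutex_lock(&iommu_sva_lock);
            if (!mm->pasid)
                    mm->pasid = pasid;
            mutex_unlock(&iommu_sva_lock);
            return 0;
    }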
   
> problems:
> 
> * We don't have a use-case for binding the mm of a remote process (and
>   it's supposedly difficult for device drivers to do it securely). So OK,
>   we remove the mm argument from iommu_sva_bind_device() and use the
>   current mm. But the IOMMU driver isn't going to do get_task_mm(current)
>   every time it needs the mm being bound, it will take it from
>   iommu_sva_bind_device(). Likewise iommu_sva_alloc_pasid() shouldn't need
>   to bother with get_task_mm().
> 
> * cgroup accounting for IOASIDs needs to be on the current task. Removing
>   the mm parameter from iommu_sva_alloc_pasid() doesn't help with that.
>   Sure it indicates that iommu_sva_alloc_pasid() needs a specific task
>   context but that's only for cgroup purpose, and I'd rather pass the
>   cgroup down from iommu_sva_bind_device() anyway (but am fine with
>   keeping it within ioasid_alloc() for now). Plus it's an internal helper,
>   easy for us to check that the callers are doing the right thing.
> 
With the above split, we really just have one allocation function:
ioasid_alloc(), so it can manage current cgroup accounting within. Would
this work?

> > if (mm->pasid) {
> > if (mm->pasid >= min && mm->pasid <= max)
> > ioasid_get(mm->pasid);
> > @@ -45,22 +51,32 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm,
> > ioasid_t min, ioasid_t max) else
> > mm->pasid = pasid;
> > }
> > +   mmput(mm);
> > +out_unlock:
mutex_unlock(&iommu_sva_lo

[PATCH 2/2] iommu/sva: Remove mm parameter from SVA bind API

2021-04-08 Thread Jacob Pan
The mm parameter in iommu_sva_bind_device() is intended for a privileged
process to perform bind() on behalf of other processes. This use case has
yet to materialize, let alone the potential security implications of
adding kernel hooks without explicit user consent.
In addition, there is agreement that IOASID allocation shall be subject to
cgroup limits. Limiting IOASID allocation during SVA bind to the current
task keeps it in line with the misc cgroup proposal.

Link: https://lore.kernel.org/linux-iommu/20210303160205.151d114e@jacob-builder/
Link: https://lore.kernel.org/linux-iommu/YFhiMLR35WWMW%2FHu@myrica/
Signed-off-by: Jacob Pan 
---
 drivers/dma/idxd/cdev.c |  2 +-
 drivers/dma/idxd/init.c |  2 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c |  6 ++---
 drivers/iommu/iommu-sva-lib.c   | 30 +++--
 drivers/iommu/iommu-sva-lib.h   |  4 ++--
 drivers/iommu/iommu.c   | 16 -
 drivers/misc/uacce/uacce.c  |  2 +-
 include/linux/iommu.h   |  7 +++---
 8 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index 21ec82b..8c3347c 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -103,7 +103,7 @@ static int idxd_cdev_open(struct inode *inode, struct file 
*filp)
filp->private_data = ctx;
 
if (device_pasid_enabled(idxd)) {
-   sva = iommu_sva_bind_device(dev, current->mm, 0);
+   sva = iommu_sva_bind_device(dev, 0);
if (IS_ERR(sva)) {
rc = PTR_ERR(sva);
dev_err(dev, "pasid allocation failed: %d\n", rc);
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index cdc85f1..a583f79 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -306,7 +306,7 @@ static int idxd_enable_system_pasid(struct idxd_device 
*idxd)
 
flags = IOMMU_SVA_BIND_SUPERVISOR;
 
-   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, flags);
+   sva = iommu_sva_bind_device(&idxd->pdev->dev, flags);
if (IS_ERR(sva)) {
dev_warn(&idxd->pdev->dev,
 "iommu sva bind failed: %ld\n", PTR_ERR(sva));
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index 23e287e..bdd5c79 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -329,7 +329,7 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct 
*mm)
return ERR_PTR(-ENOMEM);
 
/* Allocate a PASID for this mm if necessary */
-   ret = iommu_sva_alloc_pasid(mm, 1, (1U << master->ssid_bits) - 1);
+   ret = iommu_sva_alloc_pasid(1, (1U << master->ssid_bits) - 1);
if (ret)
goto err_free_bond;
 
@@ -347,7 +347,7 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct 
*mm)
return &bond->sva;
 
 err_free_pasid:
-   iommu_sva_free_pasid(mm);
+   iommu_sva_free_pasid();
 err_free_bond:
kfree(bond);
return ERR_PTR(ret);
@@ -377,7 +377,7 @@ void arm_smmu_sva_unbind(struct iommu_sva *handle)
if (refcount_dec_and_test(&bond->refs)) {
list_del(&bond->list);
arm_smmu_mmu_notifier_put(bond->smmu_mn);
-   iommu_sva_free_pasid(bond->mm);
+   iommu_sva_free_pasid();
kfree(bond);
}
mutex_unlock(&sva_lock);
diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c
index bd41405..bd99f6b 100644
--- a/drivers/iommu/iommu-sva-lib.c
+++ b/drivers/iommu/iommu-sva-lib.c
@@ -12,27 +12,33 @@ static DECLARE_IOASID_SET(iommu_sva_pasid);
 
 /**
  * iommu_sva_alloc_pasid - Allocate a PASID for the mm
- * @mm: the mm
  * @min: minimum PASID value (inclusive)
  * @max: maximum PASID value (inclusive)
  *
- * Try to allocate a PASID for this mm, or take a reference to the existing one
- * provided it fits within the [@min, @max] range. On success the PASID is
- * available in mm->pasid, and must be released with iommu_sva_free_pasid().
+ * Try to allocate a PASID for the current mm, or take a reference to the
+ * existing one provided it fits within the [@min, @max] range. On success
+ * the PASID is available in the current mm->pasid, and must be released with
+ * iommu_sva_free_pasid().
  * @min must be greater than 0, because 0 indicates an unused mm->pasid.
  *
  * Returns 0 on success and < 0 on error.
  */
-int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max)
+int iommu_sva_alloc_pasid(ioasid_t min, ioasid_t max)
 {
int ret = 0;
ioasid_t pasid;
+   struct mm_struct *mm;
 
if (min == INVALID_IOASID || max == INVALID_IOASID ||
min == 0 

[PATCH 1/2] iommu/sva: Tighten SVA bind API with explicit flags

2021-04-08 Thread Jacob Pan
The void *drvdata parameter isn't really used in the iommu_sva_bind_device()
API; the current IDXD code "borrows" drvdata to carry a VT-d private flag
for supervisor SVA usage.

Supervisor/Privileged mode request is a generic feature. It should be
promoted from the VT-d vendor driver to the generic code.

This patch replaces the void *drvdata parameter with an unsigned int flags
parameter and adjusts callers accordingly.

Link: https://lore.kernel.org/linux-iommu/YFhiMLR35WWMW%2FHu@myrica/
Suggested-by: Jean-Philippe Brucker 
Signed-off-by: Jacob Pan 
---
 drivers/dma/idxd/cdev.c |  2 +-
 drivers/dma/idxd/init.c |  6 +++---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c |  2 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  4 ++--
 drivers/iommu/intel/Kconfig |  1 +
 drivers/iommu/intel/svm.c   | 18 ++
 drivers/iommu/iommu.c   |  9 ++---
 drivers/misc/uacce/uacce.c  |  2 +-
 include/linux/intel-iommu.h |  2 +-
 include/linux/intel-svm.h   | 17 ++---
 include/linux/iommu.h   | 19 ---
 11 files changed, 40 insertions(+), 42 deletions(-)

diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index 0db9b82..21ec82b 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -103,7 +103,7 @@ static int idxd_cdev_open(struct inode *inode, struct file 
*filp)
filp->private_data = ctx;
 
if (device_pasid_enabled(idxd)) {
-   sva = iommu_sva_bind_device(dev, current->mm, NULL);
+   sva = iommu_sva_bind_device(dev, current->mm, 0);
if (IS_ERR(sva)) {
rc = PTR_ERR(sva);
dev_err(dev, "pasid allocation failed: %d\n", rc);
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 085a0c3..cdc85f1 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -300,13 +300,13 @@ static struct idxd_device *idxd_alloc(struct pci_dev 
*pdev)
 
 static int idxd_enable_system_pasid(struct idxd_device *idxd)
 {
-   int flags;
+   unsigned int flags;
unsigned int pasid;
struct iommu_sva *sva;
 
-   flags = SVM_FLAG_SUPERVISOR_MODE;
+   flags = IOMMU_SVA_BIND_SUPERVISOR;
 
-   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags);
+   sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, flags);
if (IS_ERR(sva)) {
dev_warn(&idxd->pdev->dev,
 "iommu sva bind failed: %ld\n", PTR_ERR(sva));
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index bb251ca..23e287e 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -354,7 +354,7 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct 
*mm)
 }
 
 struct iommu_sva *
-arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
+arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, unsigned int flags)
 {
struct iommu_sva *handle;
struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index f985817..b971d4d 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -711,7 +711,7 @@ bool arm_smmu_master_sva_enabled(struct arm_smmu_master 
*master);
 int arm_smmu_master_enable_sva(struct arm_smmu_master *master);
 int arm_smmu_master_disable_sva(struct arm_smmu_master *master);
 struct iommu_sva *arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm,
-   void *drvdata);
+   unsigned int flags);
 void arm_smmu_sva_unbind(struct iommu_sva *handle);
 u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle);
 void arm_smmu_sva_notifier_synchronize(void);
@@ -742,7 +742,7 @@ static inline int arm_smmu_master_disable_sva(struct 
arm_smmu_master *master)
 }
 
 static inline struct iommu_sva *
-arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
+arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, unsigned int flags)
 {
return ERR_PTR(-ENODEV);
 }
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index 28a3d15..5415052 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -41,6 +41,7 @@ config INTEL_IOMMU_SVM
select PCI_PRI
select MMU_NOTIFIER
select IOASID
+   select IOMMU_SVA_LIB
help
  Shared Virtual Memory (SVM) provides a facility for devices
  to access DMA resources through process address space by
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 574a7e6..4b5

Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-04-01 Thread Jacob Pan
Hi Jason,

On Wed, 31 Mar 2021 21:37:05 -0300, Jason Gunthorpe  wrote:

> On Wed, Mar 31, 2021 at 04:46:21PM -0700, Jacob Pan wrote:
> > Hi Jason,
> > 
> > On Wed, 31 Mar 2021 09:38:01 -0300, Jason Gunthorpe 
> > wrote: 
> > > > > Get rid of the ioasid set.
> > > > >
> > > > > Each driver has its own list of allowed ioasids.
> > >  [...]  
> > > 
> > > The /dev/ioasid FD replaces this security check. By becoming FD
> > > centric you don't need additional kernel security objects.
> > > 
> > > Any process with access to the /dev/ioasid FD is allowed to control
> > > those PASID. The seperation between VMs falls naturally from the
> > > seperation of FDs without creating additional, complicated, security
> > > infrastrucure in the kernel.
> > > 
> > > This is why all APIs must be FD focused, and you need to have a
> > > logical layering of responsibility.
> > > 
> > >  Allocate a /dev/ioasid FD
> > >  Allocate PASIDs inside the FD
Just to be super clear: do we allocate an FD for each PASID and return the
FD to the user, or return the plain PASID number back to user space?

> > >  Assign memory to the PASIDS
> > > 
> > >  Open a device FD, eg from VFIO or VDP
> > >  Instruct the device FD to authorize the device to access PASID A in
> > >  an ioasid FD  
> > How do we know user provided PASID A was allocated by the ioasid FD?  
> 
> You pass in the ioasid FD and use a 'get pasid from fdno' API to
> extract the required kernel structure.
> 
Seems you are talking about two FDs:
- /dev/ioasid FD
- per IOASID FD
This API, ioasid = get_pasid_from_fd(dev_ioasid_fd, ioasid_fd), would have
dev_ioasid_fd find the xarray for all the PASIDs allocated under it, and
ioasid_fd will be the index into the xarray to retrieve the actual ioasid.
Correct?
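
If so, here is a rough sketch of how I picture such a lookup on the kernel
side (ioasid_fops, struct ioasid_ctx and struct ioasid_data are made-up
names for illustration, not existing APIs):

static struct ioasid_data *ioasid_get_from_fd(int ioasid_fd, ioasid_t pasid)
{
	struct fd f = fdget(ioasid_fd);
	struct ioasid_ctx *ctx;
	struct ioasid_data *data;

	if (!f.file)
		return ERR_PTR(-EBADF);
	/* Prove the fd really is a /dev/ioasid FD */
	if (f.file->f_op != &ioasid_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}
	ctx = f.file->private_data;
	/* The per-FD xarray is the ownership check: a PASID not
	 * allocated under this FD simply is not there. */
	data = xa_load(&ctx->pasid_xa, pasid);
	if (!data)
		data = ERR_PTR(-ENOENT);
	fdput(f);
	return data;
}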

> > Shouldn't we validate user input by tracking which PASIDs are
> > allocated by which ioasid FD?  
> 
> Yes, but it is integral to the ioasid FD, not something separated.
> 
OK, if we have a per-IOASID FD in addition to the /dev/ioasid FD, we can
validate user input.

> > > VFIO extracts some kernel representation of the ioasid from the ioasid
> > > fd using an API
> > >   
> > This lookup API seems to be asking for a per-ioasid-FD storage array.
> > Today, the ioasid_set is per mm and contains an Xarray.   
> 
> Right, put the xarray per FD. A set per mm is fairly nonsensical, we
> don't use the mm as that kind of security key.
> 
Sounds good, one xarray per /dev/ioasid FD.

> > Since each VM's KVM can only open one ioasid FD, this per-FD array
> > would be equivalent to the per-mm ioasid_set, right?  
> 
> Why only one?  Each interaction with the other FDs should include the
> PASID/FD pair. There is no restriction to just one.
> 
OK, one per subsystem per VM. For example, if a VM has a VFIO and a VDPA
device, it would open two /dev/ioasid FDs, one for each. Correct?

> > > VFIO does some kernel call to IOMMU/IOASID layer that says 'tell the
> > > IOMMU that this PCI device is allowed to use this PASID'  
> >
> > Would it be redundant to what iommu_uapi_sva_bind_gpasid() does? I
> > thought the idea is to use ioasid FD IOCTL to issue IOMMU uAPI calls.
> > Or we can skip this step for now and wait for the user to do SVA bind.  
> 
> I'm not sure what you are asking.
> 
> Possibly some of the IOMMU API will need a bit adjusting to make
> things split.
> 
> The act of programming the page tables and the act of authorizing a
> PCI BDF to use a PASID are distinct things with two different IOCTLs.
> 
Why separate? I don't see a use case that authorizes a PASID but doesn't
bind it with a page table. The very act of binding the page table *is* the
authorization.

> iommu_uapi_sva_bind_gpasid() is never called by anything, and its
> uAPI is never implemented.
> 
Just a little background here. We have been working on the vSVA stack
since 2017. At the time, VFIO was the de facto interface for IOMMU-aware
driver frameworks. These uAPIs were always developed alongside the
accompanying VFIO patches, which served as consumers. By the time these
IOMMU uAPIs were merged after reviews from most vendors, the VFIO patchset
was approaching maturity at around v7. This is when we suddenly saw a new
request to support VDPA, which had tried VFIO earlier but ultimately moved
away.

For a complex stack like vSVA, I feel we have to reduce moving parts and do
some divide and conquer.

> Joerg? Why did you merge dead uapi and dead code?
> 
> Jason


Thanks,

Jacob


Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-31 Thread Jacob Pan
Hi Jason,

On Wed, 31 Mar 2021 09:38:01 -0300, Jason Gunthorpe  wrote:

> > > Get rid of the ioasid set.
> > >
> > > Each driver has its own list of allowed ioasids.  
>  [...]  
> 
> The /dev/ioasid FD replaces this security check. By becoming FD
> centric you don't need additional kernel security objects.
> 
> Any process with access to the /dev/ioasid FD is allowed to control
> those PASIDs. The separation between VMs falls naturally from the
> separation of FDs without creating additional, complicated, security
> infrastructure in the kernel.
> 
> This is why all APIs must be FD focused, and you need to have a
> logical layering of responsibility.
> 
>  Allocate a /dev/ioasid FD
>  Allocate PASIDs inside the FD
>  Assign memory to the PASIDS
> 
>  Open a device FD, eg from VFIO or VDP
>  Instruct the device FD to authorize the device to access PASID A in
>  an ioasid FD
How do we know the user-provided PASID A was allocated by the ioasid FD?
Shouldn't we validate user input by tracking which PASIDs are allocated by
which ioasid FD? This is one reason why we have ioasid_set and its xarray.

>* Prior to being authorized the device will have NO access to any
>  PASID
>* Presenting BOTH the device FD and the ioasid FD to the kernel
>  is the security check. Any process with both FDs is allowed to
>  make the connection. This is normal Unix FD centric thinking.
> 
> > > Register a ioasid in the driver's list by passing the fd and ioasid #
> > >  
> > 
> > The fd here is a device fd. Am I right?   
> 
> It would be the vfio_device FD, for instance, and a VFIO IOCTL.
> 
> > If yes, your idea is ioasid is allocated via /dev/ioasid and
> > associated with device fd via either VFIO or vDPA ioctl. right?
> > sorry I may be asking silly questions but really need to ensure we
> > are talking on the same page.  
> 
> Yes, this is right
> 
> > > No listening to events. A simple understandable security model.  
> > 
> > For this suggestion, I have a bit of concern that we may have an A-B/B-A
> > lock sequence issue, since it requires the /dev/ioasid (if it supports
> > this) to call back into VFIO/VDPA to check if the ioasid has been
> > registered to the device FD and record it in the per-device list, right?
> > Let's have more discussion based on the skeleton sent by Kevin.  
> 
> Callbacks would be backwards.
> 
> User calls vfio with vfio_device fd and dev/ioasid fd
> 
> VFIO extracts some kernel representation of the ioasid from the ioasid
> fd using an API
> 
This lookup API seems to be asking for a per-ioasid-FD storage array. Today,
the ioasid_set is per mm and contains an Xarray. Since each VM's KVM can
only open one ioasid FD, this per-FD array would be equivalent to the per-mm
ioasid_set, right?

> VFIO does some kernel call to IOMMU/IOASID layer that says 'tell the
> IOMMU that this PCI device is allowed to use this PASID'
> 
Would it be redundant to what iommu_uapi_sva_bind_gpasid() does? I thought
the idea is to use ioasid FD IOCTL to issue IOMMU uAPI calls. Or we can
skip this step for now and wait for the user to do SVA bind.

> VFIO mdev drivers then record that the PASID is allowed in its own
> device specific struct for later checking during other system calls.


Thanks,

Jacob


Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-31 Thread Jacob Pan
Hi Jason,

On Wed, 31 Mar 2021 15:33:24 -0300, Jason Gunthorpe  wrote:

> On Wed, Mar 31, 2021 at 11:20:30AM -0700, Jacob Pan wrote:
> > Hi Jason,
> > 
> > On Wed, 31 Mar 2021 14:31:48 -0300, Jason Gunthorpe 
> > wrote: 
> > > > > We should try to avoid hidden behind the scenes kernel
> > > > > interconnections between subsystems.
> > > > > 
>  [...]  
>  [...]  
> > yes, this is done in this patchset.
> >   
>  [...]  
> > Just to clarify, you are saying (when FREE happens before proper
> > teardown) there is no need to proactively notify all users of the
> > IOASID to drop their reference. Instead, just wait for the other
> > parties to naturally close and drop their references. Am I
> > understanding you correctly?  
> 
> Yes. What are receivers going to do when you notify them anyhow? What
> will a mdev do? This is how you get into the crazy locking problems.
> 
The receivers perform cleanup work similar to a normal unbind: drain/abort
the PASID. Locking is an issue in that the atomic notifier runs under the
IOASID spinlock, so I provided a common ordered workqueue to let mdev
drivers queue cleanup work that cannot be done in atomic context. Not
ideal. We also need to prevent nested notifications in certain cases.

> It is an error for userspace to shutdown like this, recover sensibly
> and don't crash the kernel. PCIe error TLPs are expected, suppress
> them. That is what we decided on the mmu notifier discussion.
> 
> > I feel having the notifications can add two values:
> > 1. Shorten the duration of errors (as you mentioned below); FD close can
> > take a long and unpredictable time, e.g. when the FD is shared.  
> 
> Only if userspace exits in some uncontrolled way. In a controlled exit
> it can close all the FDs in the right order.
> 
> It is OK if userspace does something weird and ends up with disabled
> IOASIDs. It shouldn't do that if it cares.
> 
Agreed.

> > 2. Provide teardown ordering among PASID users. i.e. vCPU, IOMMU, mdev.
> >  
> 
> This is a hard ask too, there is no natural ordering here I can see,
> obviously we want vcpu, mdev, iommu for qemu but that doesn't seem to
> fall out unless we explicitly hard wire it into the kernel.
> 
The ordering problem as I understand it is that it is difficult for KVM to
rendezvous all vCPUs before updating the PASID translation table. So there
could be in-flight enqcmd with the stale PASID after the PASID table update
and refcount drop.

If KVM is the last one to drop the PASID refcount, the PASID could be
immediately reused and starts a new life. The in-flight enqcmd with the
stale PASID could cause problems. The likelihood and window is very small.

If we ensure KVM does the PASID table update before the IOMMU and mdev
driver, the stale PASID in the in-flight enqcmd would be drained before it
starts a new life.

Perhaps Yi and Kevin can explain this better.

> Doesn't kvm always kill the vCPU first based on the mmu notifier
> shooting down all the memory? IIRC this happens before FD close?
> 
I don't know the answer, Kevin & Yi?

> > > The duration between unmapping the ioasid and releasing all HW access
> > > will have HW see PCIE TLP errors due to the blocked access. If
> > > userspace messes up the order it is fine to cause this. We already had
> > > this dicussion when talking about how to deal with process exit in the
> > > simple SVA case.  
> > Yes, we have disabled fault reporting during this period. The slight
> > difference vs. the simple SVA case is that KVM is also involved and
> > there might be an ordering requirement to stop the vCPU first.  
> 
> KVM can continue to use the PASIDs, they are parked and DMA is
> permanently blocked. When KVM reaches a natural point in its teardown
> it can release them.
> 
> If you have to stop the vcpu from a iommu notifier you are in the
> crazy locking world I mentioned. IMHO don't create exciting locking
> problems just to avoid PCI errors in uncontrolled shutdown.
> 
> Suppress the errors instead.
> 
I agree, this simplifies things a lot. Just need to clarify the in-flight
enqcmd case.

> Jason


Thanks,

Jacob


Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-31 Thread Jacob Pan
Hi Jason,

On Wed, 31 Mar 2021 14:31:48 -0300, Jason Gunthorpe  wrote:

> > > We should try to avoid hidden behind the scenes kernel
> > > interconnections between subsystems.
> > >   
> > Can we, in the case of exceptions? Since all these IOCTLs are coming
> > from unreliable user space, we must deal with all exceptions.
> >
> > For example, when user closes /dev/ioasid FD before (or w/o) unbind
> > IOCTL for VFIO, KVM, kernel must do cleanup and coordinate among
> > subsystems. In this patchset, we have a per mm(ioasid_set) notifier to
> > inform mdev, KVM to clean up and drop its refcount. Do you have any
> > suggestion on this?  
> 
> The ioasid should be a reference counted object.
> 
yes, this is done in this patchset.

> When the FD is closed, or the ioasid is "destroyed" it just blocks DMA
> and parks the PASID until *all* places release it. Upon a zero
> refcount the PASID is recycled for future use.
> 
Just to clarify, you are saying (when FREE happens before proper
teardown) there is no need to proactively notify all users of the IOASID to
drop their reference. Instead, just wait for the other parties to naturally
close and drop their references. Am I understanding you correctly?
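
If so, a minimal sketch of the park-until-last-put model as I picture it
(struct ioasid_data as shown, block_dma_for_pasid() and free_pasid_number()
are illustrative names, not existing APIs):

struct ioasid_data {
	ioasid_t id;
	refcount_t refs;
};

void ioasid_put(struct ioasid_data *data)
{
	if (!refcount_dec_and_test(&data->refs))
		return;
	free_pasid_number(data->id);	/* recycled only at refcount zero */
}

void ioasid_destroy(struct ioasid_data *data)
{
	block_dma_for_pasid(data->id);	/* park: DMA blocked immediately */
	ioasid_put(data);		/* number lives until all users put */
}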

I feel having the notifications can add two values:
1. Shorten the duration of errors (as you mentioned below); FD close can
take a long and unpredictable time, e.g. when the FD is shared.
2. Provide teardown ordering among PASID users. i.e. vCPU, IOMMU, mdev.

> The duration between unmapping the ioasid and releasing all HW access
> will have HW see PCIE TLP errors due to the blocked access. If
> userspace messes up the order it is fine to cause this. We already had
> this discussion when talking about how to deal with process exit in the
> simple SVA case.
Yes, we have disabled fault reporting during this period. The slight
difference vs. the simple SVA case is that KVM is also involved and there
might be an ordering requirement to stop the vCPU first.

Thanks,

Jacob


Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-31 Thread Jacob Pan
Hi Jason,

On Wed, 31 Mar 2021 09:28:05 -0300, Jason Gunthorpe  wrote:

> On Tue, Mar 30, 2021 at 05:10:41PM -0700, Jacob Pan wrote:
>  [...]  
>  [...]  
>  [...]  
> > This requires the mdev driver to obtain a list of allowed
> > PASIDs (possibly during PASID bind time) prior to doing enforcement.
> > IMHO, the PASID enforcement points are:
> > 1. During WQ configuration (e.g. programming the MSI)
> > 2. During work submission
> > 
> > For the VT-d shared workqueue, there is no way to enforce #2 in the mdev
> > driver in that the PASID is obtained from the PASID MSR on the CPU and
> > submitted w/o driver involvement.  
> 
> I assume that the PASID MSR is privileged and only qemu can program
> it? Otherwise this seems like a security problem.
> 
yes.

> If qemu controls it then the idxd userspace driver in qemu must ensure
> it is only ever programmed to an authorized PASID.
> 
it is ensured for #1.

> > The enforcement for #2 is in the KVM PASID translation table, which
> > is per VM.  
> 
> I don't understand why KVM gets involved in PASID??
> 
Here is an excerpt from the SIOV spec.
https://software.intel.com/content/www/us/en/develop/download/intel-scalable-io-virtualization-technical-specification.html

"3.3 PASID translation
To support PASID isolation for Shared Work Queues used by VMs, the CPU must
provide a way for the PASID to be communicated to the device in the DMWr
transaction. On Intel CPUs, the CPU provides a PASID translation table in
the vCPUs virtual machine control structures. During ENQCMD/ENQCMDS
instruction execution in a VM, the PASID translation table is used by the
CPU to replace the guest PASID in the work descriptor with a host PASID
before the descriptor is sent to the device."

> Doesn't work submission go either to the mdev driver or through the
> secure PASID of #1?
> 
No, once a PASID is bound with IOMMU, KVM, and the mdev, work submission is
all done in HW.
But I don't think this will change for either uAPI design.

> > For our current VFIO mdev model, bind guest page table does not involve
> > mdev driver. So this is a gap we must fill, i.e. include a callback from
> > mdev driver?  
> 
> No not a callback, tell the mdev driver with a VFIO IOCTL that it is
> authorized to use a specific PASID because the vIOMMU was told to
> allow it by the guest kernel. Simple and straightforward.
> 
Makes sense.

> > > ioasid_set doesn't seem to help at all, certainly not as a concept
> > > tied to /dev/ioasid.
> > >   
> > Yes, we can take the security role off ioasid_set once we have a per-mdev
> > list. However, ioasid_set being a per-VM/mm entity also bridges
> > communications among kernel subsystems that don't have a direct call
> > path, e.g. KVM, VDCM and IOMMU.  
> 
> Everything should revolve around the /dev/ioasid FD. qemu should pass
> it to all places that need to know about PASID's in the VM.
> 
I guess we need to extend the KVM interface to support PASIDs. Our original
intention was to avoid introducing new interfaces.

> We should try to avoid hidden behind the scenes kernel
> interconnections between subsystems.
> 
Can we, in the case of exceptions? Since all these IOCTLs are coming from
unreliable user space, we must deal with all exceptions.

For example, when the user closes the /dev/ioasid FD before (or without)
the unbind IOCTL for VFIO or KVM, the kernel must do cleanup and coordinate
among subsystems. In this patchset, we have a per-mm (ioasid_set) notifier
to tell mdev and KVM to clean up and drop their refcounts. Do you have any
suggestions on this?

> 
> > > So when you 'allow' a mdev to access a PASID you want to say:
> > >  Allow Guest PASID A, map it to host PASID B on this /dev/ioasid FD
> > >   
> 
> > Host and guest PASID value, as well as device info are available through
> > iommu_uapi_sva_bind_gpasid(), we just need to feed that info to mdev
> > driver.  
> 
> You need that IOCTL to exist on the *mdev driver*. It is a VFIO ioctl,
> not a iommu or ioasid or sva IOCTL.
>
OK. A separate IOCTL and separate step.

> > > That seems like a good helper library to provide for drivers to use,
> > > but it should be a construct entirely contained in the driver.  
> > why? would it be cleaner if it is in the common code?  
> 
> No, it is the "mid layer" problematic design.
> 
> Having the iommu layer store driver-specific data on behalf of a
> driver will just make a mess. Use the natural layering we have and
> store driver specific data in the driver structs.
> 
> Add a library to help build the datastructure if it necessary.
> 
Let me try to paraphrase: you are suggesting common helper code and a
common data format, but still driver-specific storage of the mapping,
correct?

Will try this out, seems cleaner.
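
If I got that right, a sketch under those assumptions (my_mdev and the two
helpers are made-up names for illustration; only the xarray calls are real
API):

struct my_mdev {
	struct xarray allowed_pasids;	/* driver-owned, per mdev; xa_init() at probe */
};

static int my_mdev_allow_pasid(struct my_mdev *m, ioasid_t pasid)
{
	/* Called from the VFIO IOCTL that authorizes a PASID */
	return xa_insert(&m->allowed_pasids, pasid, xa_mk_value(1),
			 GFP_KERNEL);
}

static bool my_mdev_pasid_allowed(struct my_mdev *m, ioasid_t pasid)
{
	/* Checked at WQ configuration time */
	return xa_load(&m->allowed_pasids, pasid) != NULL;
}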

> Jason


Thanks,

Jacob


Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-30 Thread Jacob Pan
Hi Jason,

On Tue, 30 Mar 2021 10:43:13 -0300, Jason Gunthorpe  wrote:

> > If two mdevs from the same PF dev are assigned to two VMs, the PASID
> > table will be shared. IOASID set ensures one VM cannot program another
> > VM's PASIDs. I assume 'secure context' is per VM when it comes to host
> > PASID.  
> 
> No, the mdev device driver must enforce this directly. It is the one
> that programms the physical shared HW, it is the one that needs a list
> of PASID's it is allowed to program *for each mdev*
> 
This requires the mdev driver to obtain a list of allowed PASIDs (possibly
during PASID bind time) prior to doing enforcement. IMHO, the PASID
enforcement points are:
1. During WQ configuration (e.g. programming the MSI)
2. During work submission

For the VT-d shared workqueue, there is no way to enforce #2 in the mdev
driver in that the PASID is obtained from the PASID MSR on the CPU and
submitted w/o driver involvement. The enforcement for #2 is in the KVM
PASID translation table, which is per VM.

For our current VFIO mdev model, binding the guest page table does not
involve the mdev driver. So this is a gap we must fill, i.e. include a
callback from the mdev driver?

> ioasid_set doesn't seem to help at all, certainly not as a concept
> tied to /dev/ioasid.
> 
Yes, we can take the security role off ioasid_set once we have a per-mdev
list. However, ioasid_set being a per-VM/mm entity also bridges
communications among kernel subsystems that don't have a direct call path,
e.g. KVM, VDCM and IOMMU.

> > No. The mdev driver consults with the IOASID core when the guest programs
> > a guest PASID onto the mdev. The VDCM driver does a lookup:
> > host_pasid = ioasid_find_by_spid(ioasid_set, guest_pasid);  
> 
> This is the wrong layering. Tell the mdev device directly what it is
> allowed to do. Do not pollute the ioasid core with security stuff.
> 
> > > I'd say you shoul have a single /dev/ioasid per VM and KVM should
> > > attach to that - it should get all the global events/etc that are not
> > > device specific.
> > >   
> > You mean a single /dev/ioasid FD per VM and KVM? I think that is what we
> > are doing in this set. A VM process can only open /dev/ioasid once, then
> > use the FD for allocation and pass the PASID for bind page table etc.  
> 
> Yes, I think that is reasonable.
> 
> Tag all the IOCTL's with the IOASID number.
>  
> > > Not sure what guest-host PASID means, these have to be 1:1 for device
> > > assignment to work - why would use something else for mdev?
> > >   
> > We have G-H PASID translation. They don't have to be 1:1.
> > IOASID Set Private ID (SPID) is intended as a generic solution for
> > guest PASID. Could you review the section "IOASID Set Private ID
> > (SPID)" in the doc patch?  
> 
> Again this only works for MDEV? How would you do translation for a
> real PF/VF?
> 
Right, we will need some mediation for PF/VF.

> So when you 'allow' a mdev to access a PASID you want to say:
>  Allow Guest PASID A, map it to host PASID B on this /dev/ioasid FD
> 
> ?
> 
The host and guest PASID values, as well as device info, are available
through iommu_uapi_sva_bind_gpasid(); we just need to feed that info to the
mdev driver.

> That seems like a good helper library to provide for drivers to use,
> but it should be a construct entirely contained in the driver.
Why? Would it be cleaner if it were in the common code?

Thanks,

Jacob


Re: [PATCH v2 1/4] iommu/vt-d: Enable write protect for supervisor SVM

2021-03-30 Thread Jacob Pan
Hi Guenter,

On Mon, 22 Mar 2021 10:53:38 -0700, Guenter Roeck 
wrote:

> On Tue, Mar 02, 2021 at 02:13:57AM -0800, Jacob Pan wrote:
> > Write protect bit, when set, inhibits supervisor writes to the read-only
> > pages. In supervisor shared virtual addressing (SVA), where page tables
> > are shared between CPU and DMA, IOMMU PASID entry WPE bit should match
> > CR0.WP bit in the CPU.
> > This patch sets WPE bit for supervisor PASIDs if CR0.WP is set.
> > 
> > Signed-off-by: Sanjay Kumar 
> > Signed-off-by: Jacob Pan 
> > ---  
> 
> ia64:defconfig:
> 
> drivers/iommu/intel/pasid.c: In function 'pasid_enable_wpe':
> drivers/iommu/intel/pasid.c:536:22: error: implicit declaration of
> function 'read_cr0' drivers/iommu/intel/pasid.c:539:23: error:
> 'X86_CR0_WP' undeclared
> 
> Maybe it _is_ time to retire ia64 ?

Good catch, sorry for the late reply. I guess otherwise I will have to add
some #ifdefs?

Many basic x86 helpers are missing in ia64.
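
For pasid.c, a sketch of the #ifdef I have in mind (pasid_wp_enabled() is a
made-up helper, and defaulting to false on non-x86 is just an assumption
for illustration):

static inline bool pasid_wp_enabled(void)
{
#ifdef CONFIG_X86
	return !!(read_cr0() & X86_CR0_WP);
#else
	return false;	/* no CR0.WP notion on other arches */
#endif
}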

+Tony

Thanks,

Jacob



Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-29 Thread Jacob Pan
Hi Jason,

On Mon, 29 Mar 2021 13:31:47 -0300, Jason Gunthorpe  wrote:

> On Wed, Mar 24, 2021 at 12:05:28PM -0700, Jacob Pan wrote:
> 
> > > IMHO a use created PASID is either bound to a mm (current) at creation
> > > time, or it will never be bound to a mm and its page table is under
> > > user control via /dev/ioasid.
> > >   
> > True for a PASID used in native SVA bind. But for binding with a guest
> > mm, the PASID is allocated first (VT-d virtual command interface, spec
> > 10.4.44), then bound to the host IOMMU when the vIOMMU PASID cache is
> > invalidated.
> > 
> > Our intention is to have two separate interfaces:
> > 1. /dev/ioasid (allocation/free only)
> > 2. /dev/sva (handles all SVA related activities including page tables)  
> 
> I'm not sure I understand why you'd want to have two things. Doesn't
> that just complicate everything?
> 
> Manipulating the ioasid, including filling it with page tables, seems
> an integral inseperable part of the whole interface. Why have two ?
> 
In one of the earlier discussions, I was made aware of some use cases (by
AMD, iirc) where PASID can be used w/o IOMMU. That is why I tried to keep
ioasid a separate subsystem. Other than that, I don't see an issue
combining the two.

> > > I thought the whole point of something like a /dev/ioasid was to get
> > > away from each and every device creating its own PASID interface?
> > >   
> > yes, but only for the use cases that need to expose PASID to the
> > userspace.  
> 
> Why "but only"? This thing should reach for a higher generality, not
> just be contained to solve some problem within qemu.
> 
I totally agree in terms of generality. I was just trying to point out that
existing frameworks or drivers, such as uacce and the idxd driver, do not
need to use /dev/ioasid.

> > > It maybe somewhat reasonable that some devices could have some easy
> > > 'make a SVA PASID on current' interface built in,  
> > I agree, this is the case where the PASID is hidden from userspace,
> > right? e.g. uacce.  
> 
> "hidden", I guess, but does it matter so much?
> 
It matters when it comes to which interface to choose: use /dev/ioasid for
allocation if the PASID value cannot be hidden; use some other interface
that binds current and allocates if the PASID is not visible to the user.

> The PASID would still consume a cgroup credit
> 
Yes, the credit is still consumed; just the PASID value is hidden.

> > > but anything more
> > > complicated should use /dev/ioasid, and anything consuming PASID
> > > should also have an API to import and attach a PASID from /dev/ioasid.
> > >   
> > Would the above two use cases meet the "complicated" criteria? Or should
> > we say anything that needs the explicit PASID value has to go through
> > /dev/ioasid?  
> 
> Anything that needs more that creating a hidden PASID link'd to
> current should use the full interface.
> 
Yes, I think we are on the same page. For example, today's uacce or idxd
driver creates a hidden PASID when the user does open(), where a new WQ is
provisioned and bound to the current mm. This is a case where /dev/ioasid
is not needed.
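
For reference, a minimal sketch of that hidden-PASID open() pattern,
loosely modeled on the idxd cdev open path (acc_dev_open() and the
inode_to_dev() lookup are illustrative, not an existing driver):

static int acc_dev_open(struct inode *inode, struct file *filp)
{
	struct device *dev = inode_to_dev(inode);	/* illustrative lookup */
	struct iommu_sva *sva;

	/* Bind current->mm; the PASID value never reaches userspace */
	sva = iommu_sva_bind_device(dev, current->mm, 0);
	if (IS_ERR(sva))
		return PTR_ERR(sva);
	filp->private_data = sva;
	return 0;
}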

> > In terms of usage for guest SVA, an ioasid_set is mostly tied to a host
> > mm, the use case is as the following:  
> 
> From that doc:
> 
>   It is imperative to enforce
>   VM-IOASID ownership such that a malicious guest cannot target DMA
>   traffic outside its own IOASIDs, or free an active IOASID that belongs
>   to another VM.
> 
> Huh?
> 
Sorry, I am not following. In the doc, I have an example to show the
ioasid_set to VM/mm mapping. We use the mm as the ioasid_set token to
identify the owner of an IOASID, i.e. who allocated it. A non-owner cannot
perform bind-page-table or free operations.

Section: IOASID Set Private ID (SPID)
 .------------------.    .------------------.
 |       VM 1       |    |       VM 2       |
 |                  |    |                  |
 |------------------|    |------------------|
 | GPASID/SPID 101  |    | GPASID/SPID 101  |
 '------------------'    '------------------'    Guest
 __________|________________________|___________
           |                        |            Host
           v                        v
 .------------------.    .------------------.
 | Host IOASID 201  |    | Host IOASID 202  |
 '------------------'    '------------------'
 |   IOASID set 1   |    |   IOASID set 2   |
 '------------------'    '------------------'


> Security in a PASID world comes from the IOMMU blocking access to the
> PASID except from approved PCI-ID's. If a VF/PF is assigned to a guest
> then that guest can cause the device to issue any PASID by having
> complete control and the vIOMMU is 

Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-25 Thread Jacob Pan
Hi Jason,

On Thu, 25 Mar 2021 14:16:45 -0300, Jason Gunthorpe  wrote:

> On Thu, Mar 25, 2021 at 10:02:36AM -0700, Jacob Pan wrote:
> > Hi Jean-Philippe,
> > 
> > On Thu, 25 Mar 2021 11:21:40 +0100, Jean-Philippe Brucker
> >  wrote:
> >   
> > > On Wed, Mar 24, 2021 at 03:12:30PM -0700, Jacob Pan wrote:  
> > > > Hi Jason,
> > > > 
> > > > On Wed, 24 Mar 2021 14:03:38 -0300, Jason Gunthorpe 
> > > > wrote:   
> > > > > On Wed, Mar 24, 2021 at 10:02:46AM -0700, Jacob Pan wrote:
> > > > > > > Also wondering about device driver allocating auxiliary
> > > > > > > domains for their private use, to do iommu_map/unmap on
> > > > > > > private PASIDs (a clean replacement to super SVA, for
> > > > > > > example). Would that go through the same path as /dev/ioasid
> > > > > > > and use the cgroup of current task?  
> > > > > >
> > > > > > For the in-kernel private use, I don't think we should restrict
> > > > > > based on cgroup, since there is no affinity to user processes. I
> > > > > > also think the PASID allocation should just use kernel API
> > > > > > instead of /dev/ioasid. Why would user space need to know the
> > > > > > actual PASID # for device private domains? Maybe I missed your
> > > > > > idea?  
> > > > > 
> > > > > There is not much in the kernel that isn't triggered by a
> > > > > process, I would be careful about the idea that there is a class
> > > > > of users that can consume a cgroup controlled resource without
> > > > > being inside the cgroup.
> > > > > 
> > > > > We've got into trouble before overlooking this and with something
> > > > > greenfield like PASID it would be best built in to the API to
> > > > > prevent a mistake. eg accepting a cgroup or process input to the
> > > > > allocator. 
> > > > Makes sense. But I think we only allow charging the current cgroup,
> > > > how about I add the following to ioasid_alloc():
> > > > 
> > > > misc_cg = get_current_misc_cg();
> > > > ret = misc_cg_try_charge(MISC_CG_RES_IOASID, misc_cg, 1);
> > > > if (ret) {
> > > > put_misc_cg(misc_cg);
> > > > return ret;
> > > > }
> > > 
> > > Does that allow PASID allocation during driver probe, in kernel_init
> > > or modprobe context?
> > >   
> > Good point. Yes, you can get cgroup subsystem state in kernel_init for
> > charging/uncharging. I would think module_init should work also since
> > it is after kernel_init. I have tried the following:
> > static int __ref kernel_init(void *unused)
> >  {
> > int ret;
> > +   struct cgroup_subsys_state *css;
> > +   css = task_get_css(current, pids_cgrp_id);
> > 
> > But that would imply:
> > 1. IOASID has to be built-in, not a module
> > 2. IOASIDs charged on PID1/init would not be subject to the cgroup limit
> > since they will be in the root cgroup, and we don't support migration
> > nor will we migrate.
> > 
> > Then it comes back to the question of why we try to limit in-kernel
> > users per cgroup if we can't enforce these cases.  
> 
> Are these real use cases? Why would a driver binding to a device
> create a single kernel pasid at bind time? Why wouldn't it use
> untagged DMA?
> 
For VT-d, I don't see such use cases. All PASID allocations by kernel
drivers have a proper process context.

> When someone needs it they can rework it and explain why they are
> doing something sane.
> 
Agreed.

> Jason


Thanks,

Jacob


Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-25 Thread Jacob Pan
Hi Jean-Philippe,

On Thu, 25 Mar 2021 11:21:40 +0100, Jean-Philippe Brucker
 wrote:

> On Wed, Mar 24, 2021 at 03:12:30PM -0700, Jacob Pan wrote:
> > Hi Jason,
> > 
> > On Wed, 24 Mar 2021 14:03:38 -0300, Jason Gunthorpe 
> > wrote: 
> > > On Wed, Mar 24, 2021 at 10:02:46AM -0700, Jacob Pan wrote:  
> > > > > Also wondering about device driver allocating auxiliary domains
> > > > > for their private use, to do iommu_map/unmap on private PASIDs (a
> > > > > clean replacement to super SVA, for example). Would that go
> > > > > through the same path as /dev/ioasid and use the cgroup of
> > > > > current task?
> > > >
> > > > For the in-kernel private use, I don't think we should restrict
> > > > based on cgroup, since there is no affinity to user processes. I
> > > > also think the PASID allocation should just use kernel API instead
> > > > of /dev/ioasid. Why would user space need to know the actual PASID
> > > > # for device private domains? Maybe I missed your idea?
> > > 
> > > There is not much in the kernel that isn't triggered by a process, I
> > > would be careful about the idea that there is a class of users that
> > > can consume a cgroup controlled resource without being inside the
> > > cgroup.
> > > 
> > > We've got into trouble before overlooking this and with something
> > > greenfield like PASID it would be best built in to the API to prevent
> > > a mistake. eg accepting a cgroup or process input to the allocator.
> > >   
> > Makes sense. But I think we only allow charging the current cgroup, how
> > about I add the following to ioasid_alloc():
> > 
> > misc_cg = get_current_misc_cg();
> > ret = misc_cg_try_charge(MISC_CG_RES_IOASID, misc_cg, 1);
> > if (ret) {
> > put_misc_cg(misc_cg);
> > return ret;
> > }  
> 
> Does that allow PASID allocation during driver probe, in kernel_init or
> modprobe context?
> 
Good point. Yes, you can get cgroup subsystem state in kernel_init for
charging/uncharging. I would think module_init should work also since it is
after kernel_init. I have tried the following:
static int __ref kernel_init(void *unused)
 {
int ret;
+   struct cgroup_subsys_state *css;
+   css = task_get_css(current, pids_cgrp_id);

But that would imply:
1. IOASID has to be built-in, not a module
2. IOASIDs charged on PID1/init would not be subject to the cgroup limit
since they will be in the root cgroup, and we don't support migration nor
will we migrate.

Then it comes back to the question of why we try to limit in-kernel users
per cgroup if we can't enforce these cases.

> Thanks,
> Jean
> 


Thanks,

Jacob


Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-24 Thread Jacob Pan
Hi Jason,

On Wed, 24 Mar 2021 14:03:38 -0300, Jason Gunthorpe  wrote:

> On Wed, Mar 24, 2021 at 10:02:46AM -0700, Jacob Pan wrote:
> > > Also wondering about device driver allocating auxiliary domains for
> > > their private use, to do iommu_map/unmap on private PASIDs (a clean
> > > replacement to super SVA, for example). Would that go through the
> > > same path as /dev/ioasid and use the cgroup of current task?  
> >
> > For the in-kernel private use, I don't think we should restrict based on
> > cgroup, since there is no affinity to user processes. I also think the
> > PASID allocation should just use kernel API instead of /dev/ioasid. Why
> > would user space need to know the actual PASID # for device private
> > domains? Maybe I missed your idea?  
> 
> There is not much in the kernel that isn't triggered by a process, I
> would be careful about the idea that there is a class of users that
> can consume a cgroup controlled resource without being inside the
> cgroup.
> 
> We've got into trouble before overlooking this and with something
> greenfield like PASID it would be best built in to the API to prevent
> a mistake. eg accepting a cgroup or process input to the allocator.
> 
Makes sense. But I think we only allow charging the current cgroup; how
about I add the following to ioasid_alloc():

misc_cg = get_current_misc_cg();
ret = misc_cg_try_charge(MISC_CG_RES_IOASID, misc_cg, 1);
if (ret) {
put_misc_cg(misc_cg);
return ret;
}

BTW, IOASID will be one of the resources under the proposed misc cgroup.
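
For completeness, here is a sketch of how the charge could pair with an
uncharge on the failure path, assuming the proposed misc controller grows
an IOASID resource type; ioasid_alloc_charged() is a made-up wrapper, and
the free path would need to uncharge the same cgroup, so the misc_cg
pointer would have to be stashed with the IOASID:

static ioasid_t ioasid_alloc_charged(struct ioasid_set *set, ioasid_t min,
				     ioasid_t max, void *private)
{
	struct misc_cg *misc_cg = get_current_misc_cg();
	ioasid_t id;

	if (misc_cg_try_charge(MISC_CG_RES_IOASID, misc_cg, 1)) {
		put_misc_cg(misc_cg);
		return INVALID_IOASID;
	}
	id = ioasid_alloc(set, min, max, private);
	if (id == INVALID_IOASID)
		misc_cg_uncharge(MISC_CG_RES_IOASID, misc_cg, 1);
	put_misc_cg(misc_cg);
	return id;
}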

Thanks,

Jacob


Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-24 Thread Jacob Pan
Hi Jason,

On Mon, 22 Mar 2021 09:03:00 -0300, Jason Gunthorpe  wrote:

> On Fri, Mar 19, 2021 at 11:22:21AM -0700, Jacob Pan wrote:
> > Hi Jason,
> > 
> > On Fri, 19 Mar 2021 10:54:32 -0300, Jason Gunthorpe 
> > wrote: 
> > > On Fri, Mar 19, 2021 at 02:41:32PM +0100, Jean-Philippe Brucker
> > > wrote:  
> > > > On Fri, Mar 19, 2021 at 09:46:45AM -0300, Jason Gunthorpe wrote:
> > > > > On Fri, Mar 19, 2021 at 10:58:41AM +0100, Jean-Philippe Brucker
> > > > > wrote: 
> > > > > > Although there is no use for it at the moment (only two upstream
> > > > > > users and it looks like amdkfd always uses current too), I quite
> > > > > > like the client-server model where the privileged process does
> > > > > > bind() and programs the hardware queue on behalf of the client
> > > > > > process.
> > > > > 
> > > > > This creates a lot of complexity, how does process A get a secure
> > > > > reference to B? How does it access the memory in B to set up the
> > > > > HW?
> > > > 
> > > > mm_access() for example, and passing addresses via IPC
> > > 
> > > I'd rather the source process establish its own PASID and then pass
> > > the rights to use it to some other process via FD passing than try to
> > > go the other way. There are lots of security questions with something
> > > like mm_access.
> > >   
> > 
> > Thank you all for the input, it sounds like we are OK to remove mm
> > argument from iommu_sva_bind_device() and iommu_sva_alloc_pasid() for
> > now?
> > 
> > Let me try to summarize PASID allocation as below:
> > 
> > Interfaces  | Usage         | Limit  | bind¹ | User visible
> > /dev/ioasid²| G-SVA/IOVA    | cgroup | No    | Yes
> > char dev³   | SVA           | cgroup | Yes   | No
> > iommu driver| default PASID | no     | No    | No
> > kernel      | super SVA     | no     | yes   | No
> > 
> > ¹ Allocated during SVA bind
> > ² PASIDs allocated via /dev/ioasid are not bound to any mm, but their
> >   ownership is assigned to the process that does the allocation.  
> 
> What does "not bound to a mm" mean?
> 
I meant that the IOASID allocated via /dev/ioasid is in a clean state (just
a number); its initial state is not bound to an mm. Unlike
sva_bind_device(), where the IOASID is allocated at bind time.

The use case is to support guest SVA bind, where allocation and bind are in
two separate steps.
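
To make the two steps concrete, a hypothetical userspace flow (both ioctl
numbers below are placeholders I invented for illustration; no such uAPI
exists, and error handling is omitted):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/types.h>

/* Placeholder ioctl numbers, invented for illustration only */
#define IOASID_IOC_ALLOC	_IOR('I', 0, __u32)
#define VFIO_IOC_BIND_GPASID	_IOW('I', 1, __u64)

static void two_step_example(int vfio_dev_fd, __u64 bind_data)
{
	int ioasid_fd = open("/dev/ioasid", O_RDWR);
	__u32 pasid;

	/* Step 1: allocation only; the PASID is a bare number, no mm */
	ioctl(ioasid_fd, IOASID_IOC_ALLOC, &pasid);

	/* Step 2: the bind happens later, when the guest vIOMMU PASID
	 * cache invalidation reaches the host */
	ioctl(vfio_dev_fd, VFIO_IOC_BIND_GPASID, &bind_data);
}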

> IMHO a use created PASID is either bound to a mm (current) at creation
> time, or it will never be bound to a mm and its page table is under
> user control via /dev/ioasid.
> 
True for a PASID used in native SVA bind. But for binding with a guest mm,
the PASID is allocated first (VT-d virtual command interface, spec 10.4.44),
then bound to the host IOMMU when the vIOMMU PASID cache is invalidated.

Our intention is to have two separate interfaces:
1. /dev/ioasid (allocation/free only)
2. /dev/sva (handles all SVA related activities including page tables)

> I thought the whole point of something like a /dev/ioasid was to get
> away from each and every device creating its own PASID interface?
> 
Yes, but only for the use cases that need to expose the PASID to userspace.
AFAICT, the cases are:
1. guest SVA (bind guest mm)
2. full PF/VF assignment (not mediated), where the guest driver wants to
program the actual PASID onto the device.

> It maybe somewhat reasonable that some devices could have some easy
> 'make a SVA PASID on current' interface built in,
I agree; this is the case where the PASID is hidden from userspace, right?
e.g. uacce.

> but anything more
> complicated should use /dev/ioasid, and anything consuming PASID
> should also have an API to import and attach a PASID from /dev/ioasid.
> 
Would the above two use cases meet the "complicated" criteria? Or should we
say anything that needs the explicit PASID value has to go through
/dev/ioasid?

Could you give some high-level hints on the APIs that hook up an IOASID
allocated from /dev/ioasid with the use cases that combine device and
domain information? Yi is working on the /dev/sva RFC; it would be good to
have a direction check.

> > Currently, the proposed /dev/ioasid interface does not map individual
> > PASID with an FD. The FD is at the ioasid_set granularity and bound to
> > the current mm. We could extend the IOCTLs to cover individual PASID-FD
> > passing case when use cases arise. Would this work?  
> 
> Is it a good idea that the FD is per ioasid_set ?
We were thinking the allocation IOCTL is on a per set basis, then we know

Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-24 Thread Jacob Pan
Hi Jean-Philippe,

On Mon, 22 Mar 2021 10:24:00 +0100, Jean-Philippe Brucker
 wrote:

> On Fri, Mar 19, 2021 at 11:22:21AM -0700, Jacob Pan wrote:
> > Hi Jason,
> > 
> > On Fri, 19 Mar 2021 10:54:32 -0300, Jason Gunthorpe 
> > wrote: 
> > > On Fri, Mar 19, 2021 at 02:41:32PM +0100, Jean-Philippe Brucker
> > > wrote:  
> > > > On Fri, Mar 19, 2021 at 09:46:45AM -0300, Jason Gunthorpe wrote:
> > > > > On Fri, Mar 19, 2021 at 10:58:41AM +0100, Jean-Philippe Brucker
> > > > > wrote: 
> > > > > > Although there is no use for it at the moment (only two upstream
> > > > > > users and it looks like amdkfd always uses current too), I quite
> > > > > > like the client-server model where the privileged process does
> > > > > > bind() and programs the hardware queue on behalf of the client
> > > > > > process.
> > > > > 
> > > > > This creates a lot of complexity, how does process A get a secure
> > > > > reference to B? How does it access the memory in B to set up the
> > > > > HW?
> > > > 
> > > > mm_access() for example, and passing addresses via IPC
> > > 
> > > I'd rather the source process establish its own PASID and then pass
> > > the rights to use it to some other process via FD passing than try to
> > > go the other way. There are lots of security questions with something
> > > like mm_access.
> > >   
> > 
> > Thank you all for the input, it sounds like we are OK to remove mm
> > argument from iommu_sva_bind_device() and iommu_sva_alloc_pasid() for
> > now?  
> 
> Fine by me. By the way the IDXD currently misuses the bind API for
> supervisor PASID, and the drvdata parameter isn't otherwise used. This
> would be a good occasion to clean both. The new bind prototype could be:
> 
> struct iommu_sva *iommu_sva_bind_device(struct device *dev, int flags)
> 
Yes, we really just hijacked drvdata as flags; it would be cleaner to use
flags explicitly.

> And a flag IOMMU_SVA_BIND_SUPERVISOR (not that I plan to implement it in
> the SMMU, but I think we need to clean the current usage)
> 
You mean move #define SVM_FLAG_SUPERVISOR_MODE out of Intel code to be a
generic flag in iommu-sva-lib.h called IOMMU_SVA_BIND_SUPERVISOR?

I agree if that is the proposal.

> > 
> > Let me try to summarize PASID allocation as below:
> > 
> > Interfaces  | Usage         | Limit  | bind¹ | User visible
> > 
> > /dev/ioasid²| G-SVA/IOVA    | cgroup | No    | Yes
> > 
> > char dev³   | SVA           | cgroup | Yes   | No
> > 
> > iommu driver| default PASID | no     | No    | No
> >  
> 
> Is this PASID #0?
> 
True for the native case, but not limited to PASID #0 for the guest case.
E.g. for mdev assignment with guest IOVA, the guest PASID would be #0, but
the host aux domain default PASID can be non-zero. Here I meant to include
both cases.

> > 
> > kernel  | super SVA | no| yes   |No
> >   
> 
> Also wondering about device driver allocating auxiliary domains for their
> private use, to do iommu_map/unmap on private PASIDs (a clean replacement
> to super SVA, for example). Would that go through the same path as
> /dev/ioasid and use the cgroup of current task?
>
For the in-kernel private use, I don't think we should restrict based on
cgroup, since there is no affinity to user processes. I also think the
PASID allocation should just use a kernel API instead of /dev/ioasid. Why
would user space need to know the actual PASID # for device-private
domains? Maybe I missed your idea?

> Thanks,
> Jean
> 
> > 
> > ¹ Allocated during SVA bind
> > ² PASIDs allocated via /dev/ioasid are not bound to any mm, but their
> >   ownership is assigned to the process that does the allocation.
> > ³ Include uacce, other private device driver char dev such as idxd
> > 
> > Currently, the proposed /dev/ioasid interface does not map individual
> > PASID with an FD. The FD is at the ioasid_set granularity and bound to
> > the current mm. We could extend the IOCTLs to cover individual PASID-FD
> > passing case when use cases arise. Would this work?
> > 
> > Thanks,
> > 
> > Jacob  


Thanks,

Jacob


Re: [Patch v3 1/2] cgroup: sev: Add misc cgroup controller

2021-03-24 Thread Jacob Pan
Hi Vipin,

On Mon, 22 Mar 2021 11:54:39 -0700, Vipin Sharma  wrote:

> On Fri, Mar 19, 2021 at 02:28:01PM -0700, Jacob Pan wrote:
> > On Thu,  4 Mar 2021 15:19:45 -0800, Vipin Sharma 
> > wrote:  
> > > +#ifndef _MISC_CGROUP_H_
> > > +#define _MISC_CGROUP_H_
> > > +  
> > nit: should you do #include <linux/cgroup.h>?
> > Otherwise, css may be undefined.  
> 
> Users of this controller will use the get_current_misc_cg() API which
> returns a pointer. Ideally the user should use this pointer and they
> shouldn't have any need to access "css" in their code. They also don't
> need to create an object of 'struct misc_cg{}', because that won't be a
> correct misc cgroup object. They should just declare a pointer like we are
> doing here in 'struct kvm_sev_info {}'.
> 
> If they do need to use "css" then they can include cgroup header in their
> code.
> 
I didn't mean that users of misc_cgroup will use css directly. I meant that
if I want to use the misc cgroup in ioasid.c, I have to do the following to
avoid an undefined css:
#include <linux/cgroup.h>
#include <linux/misc_cgroup.h>

So it might be simpler if you do #include <linux/cgroup.h> inside
misc_cgroup.h. Then in ioasid.c, I only need to do
#include <linux/misc_cgroup.h>.

> Let me know if I am overlooking something here.
> 
> Thanks
> Vipin Sharma


Thanks,

Jacob


Re: [Patch v3 1/2] cgroup: sev: Add misc cgroup controller

2021-03-19 Thread Jacob Pan
Hi Vipin,

On Thu,  4 Mar 2021 15:19:45 -0800, Vipin Sharma  wrote:

> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Miscellaneous cgroup controller.
> + *
> + * Copyright 2020 Google LLC
> + * Author: Vipin Sharma 
> + */
> +#ifndef _MISC_CGROUP_H_
> +#define _MISC_CGROUP_H_
> +
nit: should you do #include <linux/cgroup.h>?
Otherwise, css may be undefined.

> +/**
> + * Types of misc cgroup entries supported by the host.
> + */

Thanks,

Jacob


Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-19 Thread Jacob Pan
Hi Jason,

On Fri, 19 Mar 2021 10:54:32 -0300, Jason Gunthorpe  wrote:

> On Fri, Mar 19, 2021 at 02:41:32PM +0100, Jean-Philippe Brucker wrote:
> > On Fri, Mar 19, 2021 at 09:46:45AM -0300, Jason Gunthorpe wrote:  
> > > On Fri, Mar 19, 2021 at 10:58:41AM +0100, Jean-Philippe Brucker wrote:
> > >   
> > > > Although there is no use for it at the moment (only two upstream
> > > > users and it looks like amdkfd always uses current too), I quite
> > > > like the client-server model where the privileged process does
> > > > bind() and programs the hardware queue on behalf of the client
> > > > process.  
> > > 
> > > This creates a lot of complexity, how does process A get a secure
> > > reference to B? How does it access the memory in B to set up the HW?  
> > 
> > mm_access() for example, and passing addresses via IPC  
> 
> I'd rather the source process establish its own PASID and then pass
> the rights to use it to some other process via FD passing than try to
> go the other way. There are lots of security questions with something
> like mm_access.
> 

Thank you all for the input; it sounds like we are OK to remove the mm
argument from iommu_sva_bind_device() and iommu_sva_alloc_pasid() for now?

Let me try to summarize PASID allocation as below:

Interfaces  | Usage         | Limit  | bind¹ | User visible

/dev/ioasid²| G-SVA/IOVA    | cgroup | No    | Yes

char dev³   | SVA           | cgroup | Yes   | No

iommu driver| default PASID | no     | No    | No

kernel      | super SVA     | no     | yes   | No


¹ Allocated during SVA bind
² PASIDs allocated via /dev/ioasid are not bound to any mm, but their
  ownership is assigned to the process that does the allocation.
³ Include uacce, other private device driver char dev such as idxd

Currently, the proposed /dev/ioasid interface does not map an individual
PASID to an FD. The FD is at the ioasid_set granularity and bound to the
current mm. We could extend the IOCTLs to cover the individual PASID-FD
passing case when use cases arise. Would this work?

Thanks,

Jacob


Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-19 Thread Jacob Pan
Hi Jean-Philippe,

On Fri, 19 Mar 2021 10:58:41 +0100, Jean-Philippe Brucker
 wrote:

> > Slightly off topic: as we are moving to use cgroups to limit PASID
> > allocations, it would be much simpler if we enforce on the current
> > task.  
> 
> Yes I think we should do that. Is there a problem with charging the
> process that does the PASID allocation even if the PASID indexes some
> other mm?
Besides complexity, my second concern is that we are sharing the misc
cgroup controller with other resources that do not have such behavior.

Cgroup v2 also has a unified hierarchy, which requires coherent behavior
among controllers.

Thanks,

Jacob


Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-03-18 Thread Jacob Pan
Hi Jean,

Slightly off topic: as we are moving to use cgroups to limit PASID
allocations, it would be much simpler if we enforce on the current task.

However, iommu_sva_alloc_pasid() takes an mm_struct pointer as an argument,
which implies it can be something other than the current task's mm. So far
all kernel callers use the current task's mm. Is there a use case for doing
PASID allocation on behalf of another mm? If not, can we remove the mm
argument?

Thanks,

Jacob

>  /**
>   * iommu_sva_alloc_pasid - Allocate a PASID for the mm
> @@ -35,11 +44,11 @@ int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max)
>   mutex_lock(&iommu_sva_lock);
>   if (mm->pasid) {
>   if (mm->pasid >= min && mm->pasid <= max)
> - ioasid_get(mm->pasid);
> + ioasid_get(iommu_sva_pasid, mm->pasid);
>   else
>   ret = -EOVERFLOW;
>   } else {
> - pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm);
> + pasid = ioasid_alloc(iommu_sva_pasid, min, max, mm);
>   if (pasid == INVALID_IOASID)
>   ret = -ENOMEM;

Thanks,

Jacob


Re: [RFC v2 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-03-16 Thread Jacob Pan
Hi Tejun,

On Mon, 15 Mar 2021 22:22:12 -0400, Tejun Heo  wrote:

> On Mon, Mar 15, 2021 at 06:30:30PM -0700, Jacob Pan wrote:
> > I don't know if this is required. I thought utilities such as cgclassify
> > need to be supported.
> > " cgclassify - move running task(s) to given cgroups "
> > If there is no such use case, I am fine with dropping the migration
> > support and will just enforce the limit on allocations.  
> 
> Yeah, that's what all other controllers do. Please read the in-tree
> cgroup2 doc.
> 
Thanks for your patience and guidance, will try to merge with misc
controller and go from there.

Thanks,

Jacob


Re: [RFC v2 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-03-15 Thread Jacob Pan
Hi Tejun,

On Mon, 15 Mar 2021 19:54:36 -0400, Tejun Heo  wrote:

> Hello,
> 
> On Mon, Mar 15, 2021 at 04:40:12PM -0700, Jacob Pan wrote:
> > 2. Then we want to move/migrate Process1 to cg_B, so we need to uncharge
> > 10 from cg_A and charge 10 to cg_B  
> 
> So, what I don't get is why this migration is necessary. This isn't
> supported as a usage pattern and no one, at least in terms of wide-spread
> usage, does this. Why is this a requirement for your use case?
> 
I don't know if this is required. I thought utilities such as cgclassify
need to be supported:
" cgclassify - move running task(s) to given cgroups "
If there is no such use case, I am fine with dropping the migration support
and will just enforce the limit on allocations.

> Thanks.
> 


Thanks,

Jacob


Re: [RFC v2 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-03-15 Thread Jacob Pan
Hi Tejun,

On Mon, 15 Mar 2021 18:19:35 -0400, Tejun Heo  wrote:

> Hello,
> 
> On Mon, Mar 15, 2021 at 03:11:55PM -0700, Jacob Pan wrote:
> > > Migration itself doesn't have restrictions but all resources are
> > > distributed on the same hierarchy, so the controllers are supposed to
> > > follow the same conventions that can be implemented by all
> > > controllers. 
> > Got it, I guess that is the behavior required by the unified hierarchy.
> > Cgroup v1 would be ok? But I am guessing we are not extending on v1?  
> 
> A new cgroup1 only controller is unlikely to be accpeted.
> 
> > The IOASIDs are programmed into devices to generate DMA requests tagged
> > with them. The IOMMU has a per-device IOASID table where each entry has
> > two pointers:
> >  - the PGD of the guest process.
> >  - the PGD of the host process
> > 
> > The result of this 2 stage/nested translation is that we can share
> > virtual address (SVA) between guest process and DMA. The host process
> > needs to allocate multiple IOASIDs since one IOASID is needed for each
> > guest process who wants SVA.
> > 
> > The DMA binding among device-IOMMU-process is setup via a series of user
> > APIs (e.g. via VFIO).
> > 
> > If a process calls fork(), the children do not inherit the IOASIDs and
> > their bindings. Children who wish to use SVA have to call those APIs to
> > establish the bindings for themselves.
> > 
> > Therefore, if a host process allocates 10 IOASIDs and then does a
> > fork()/clone(), it cannot charge those 10 IOASIDs to the new cgroup;
> > i.e. the 10 IOASIDs stay with the process wherever it goes.
> > 
> > I feel this fits the domain model, true?  
> 
> I still don't get where migration is coming into the picture. Who's
> migrating where?
> 
Sorry, perhaps I can explain with an example.

There are two cgroups: cg_A and cg_B with limit set to 20 for both. Process1
is in cg_A. The initial state is:
cg_A/ioasid.current=0, cg_A/ioasid.max=20
cg_B/ioasid.current=0, cg_B/ioasid.max=20

Now, consider the following steps:

1. Process1 allocated 10 IOASIDs,
cg_A/ioasid.current=10,
cg_B/ioasid.current=0

2. Then we want to move/migrate Process1 to cg_B, so we need to uncharge 10
from cg_A and charge 10 to cg_B

3. After the migration, I expect
cg_A/ioasid.current=0,
cg_B/ioasid.current=10

We don't enforce the limit during this organizational change since we can't
force-free IOASIDs. But any new allocations will be subject to the limit
set in ioasid.max.

> Thanks.
> 


Thanks,

Jacob


Re: [RFC v2 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-03-15 Thread Jacob Pan
Hi Tejun,

On Sat, 13 Mar 2021 13:05:36 -0500, Tejun Heo  wrote:

> Hello,
> 
> On Sat, Mar 13, 2021 at 08:57:01AM -0800, Jacob Pan wrote:
> > Isn't the PIDs controller doing the charge/uncharge? I was under the
> > impression that each resource can be independently charged/uncharged;
> > why does it affect other resources? Sorry for the basic question.  
> 
> Yeah, PID is an exception as we needed the initial migration to seed new
> cgroups and it gets really confusing with other ways to observe the
> processes - e.g. if you follow the original way of creating a cgroup,
> forking and then moving the seed process into the target cgroup, if we
> don't migrate the pid charge together, the numbers wouldn't agree and the
> seeder cgroup may end up running out of pids if there are any
> restrictions.
> 
Thanks for explaining. Unfortunately, it seems IOASIDs have a similar need
in terms of migrating the charge.

> > I also didn't quite get the limitation on cgroup v2 migration, this is
> > much simpler than memcg. Could you give me some pointers?  
> 
> Migration itself doesn't have restrictions but all resources are
> distributed on the same hierarchy, so the controllers are supposed to
> follow the same conventions that can be implemented by all controllers.
> 
Got it, I guess that is the behavior required by the unified hierarchy.
Cgroup v1 would be ok? But I am guessing we are not extending on v1?

> > BTW, since the IOASIDs are used to tag DMA and are bound to the guest
> > process (mm) for shared virtual addressing, fork() cannot be supported,
> > so I guess clone is not a solution here.  
> 
> Can you please elaborate what wouldn't work? The new spawning into a new
> cgroup w/ clone doesn't really change the usage model. It's just a neater
> way to seed a new cgroup. If you're saying that the overall usage model
> doesn't fit the needs of IOASIDs, it likely shouldn't be a cgroup
> controller.
> 
The IOASIDs are programmed into devices to generate DMA requests tagged
with them. The IOMMU has a per-device IOASID table where each entry has two
pointers:
 - the PGD of the guest process.
 - the PGD of the host process

The result of this 2-stage/nested translation is that we can share virtual
addresses (SVA) between a guest process and DMA. The host process needs to
allocate multiple IOASIDs since one IOASID is needed for each guest process
that wants SVA.

The DMA binding among device-IOMMU-process is setup via a series of user
APIs (e.g. via VFIO).

If a process calls fork(), the children do not inherit the IOASIDs and
their bindings. Children who wish to use SVA have to call those APIs to
establish the bindings for themselves.

Therefore, if a host process allocates 10 IOASIDs and then does a
fork()/clone(), it cannot charge those 10 IOASIDs to the new cgroup; i.e.
the 10 IOASIDs stay with the process wherever it goes.

I feel this fits the domain model, true?

> Thanks.
> 


Thanks,

Jacob


Re: [RFC v2 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-03-13 Thread Jacob Pan
Hi Tejun,

On Sat, 13 Mar 2021 05:20:39 -0500, Tejun Heo  wrote:

> On Fri, Mar 12, 2021 at 02:59:04PM -0800, Jacob Pan wrote:
> > Our primary goal is to limit the number of IOASIDs that VMs can
> > allocate. If a VM is migrated to a different cgroup, I think we need to
> > charge/uncharge the destination/source cgroup in order to enforce the
> > limit. I am not an expert here; any feedback would be appreciated.  
> 
> That simply isn't a supported usage model. None of other resources will
> get tracked if you do that.
Isn't the PIDs controller doing the charge/uncharge? I was under the
impression that each resource can be independently charged/uncharged; why
does it affect other resources? Sorry for the basic question.

I also didn't quite get the limitation on cgroup v2 migration; this is much
simpler than memcg. Could you give me some pointers?

BTW, since the IOASIDs are used to tag DMA and are bound to the guest
process (mm) for shared virtual addressing, fork() cannot be supported, so
I guess clone is not a solution here.

Thanks,

Jacob


Re: [RFC v2 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-03-12 Thread Jacob Pan
Hi Vipin,

On Fri, 12 Mar 2021 13:15:14 -0800, Vipin Sharma  wrote:

> On Fri, Mar 12, 2021 at 12:58:21PM -0800, Jacob Pan wrote:
> > Hi Vipin & Tejun,
> > 
> > Sorry for the late reply, I sent from a different email address than I
> > intended. Please see my comments inline.
> > 
> > 
> > On Thu, 4 Mar 2021 03:51:16 -0500, Tejun Heo  wrote:
> >   
> > > Hello,
> > > 
> > > On Wed, Mar 03, 2021 at 10:22:03PM -0800, Vipin Sharma wrote:  
> > > > > I am trying to see if IOASIDs cgroup can also fit in this misc
> > > > > controller as yet another resource type.
> > > > > https://lore.kernel.org/linux-iommu/20210303131726.7a8cb169@jacob-builder/T/#u
> > > > > However, unlike sev IOASIDs need to be migrated if the process is
> > > > > moved to another cgroup. i.e. charge the destination and uncharge
> > > > > the source.
> > > > > 
> > > > > Do you think this behavior can be achieved by differentiating
> > > > > resource types? i.e. add attach callbacks for certain types.
> > > > > Having a single misc interface seems cleaner than creating
> > > > > another controller.
> > > > 
> > > > I think it makes sense to add support for migration for the
> > > > resources which need it. Resources like SEV, SEV-ES will not
> > > > participate in migration and won't stop can_attach() from
> > > > succeeding; other resources which need migration will allow or
> > > > stop it based on their limits and capacity in the destination.  
> > >   
> > Sounds good. Perhaps some capability/feature flags for each resource
> > such that different behavior can be accommodated?
> > Could you please include me in your future posting? I will rebase on
> > yours.  
> 
> Hi Jacob
> 
> Based on Tejun's response, I will not add charge migration support in
> misc controller.
> 
Sounds good. I need some confirmation on whether migration is a must-have
for IOASIDs allocated by VMs.
Our primary goal is to limit the number of IOASIDs that VMs can allocate.
If a VM is migrated to a different cgroup, I think we need to
charge/uncharge the destination/source cgroup in order to enforce the
limit. I am not an expert here; any feedback would be appreciated.

> I can definitely add you in my future posting, if you still wanna use it
> without charge migration support.
> 
Yes, please. I got your v3 already, so just future patches.

> Thanks
> Vipin


Thanks,

Jacob


Re: [RFC v2 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-03-12 Thread Jacob Pan
Hi Vipin & Tejun,

Sorry for the late reply, I sent from a different email address than I
intended. Please see my comments inline.


On Thu, 4 Mar 2021 03:51:16 -0500, Tejun Heo  wrote:

> Hello,
> 
> On Wed, Mar 03, 2021 at 10:22:03PM -0800, Vipin Sharma wrote:
> > > I am trying to see if IOASIDs cgroup can also fit in this misc
> > > controller as yet another resource type.
> > > https://lore.kernel.org/linux-iommu/20210303131726.7a8cb169@jacob-builder/T/#u
> > > However, unlike sev IOASIDs need to be migrated if the process is
> > > moved to another cgroup. i.e. charge the destination and uncharge the
> > > source.
> > > 
> > > Do you think this behavior can be achieved by differentiating resource
> > > types? i.e. add attach callbacks for certain types. Having a single
> > > misc interface seems cleaner than creating another controller.  
> > 
> > I think it makes sense to add support for migration for the resources
> > which need it. Resources like SEV, SEV-ES will not participate in
> > migration and won't stop can_attach() from succeeding; other resources
> > which need migration will allow or stop it based on their limits and
> > capacity in the destination.  
> 
Sounds good. Perhaps some capability/feature flags for each resource such
that different behavior can be accommodated?
Could you please include me in your future posting? I will rebase on yours.

> Please note that cgroup2 by and large don't really like or support charge
> migration or even migrations themselves. We tried that w/ memcg on cgroup1
> and it turned out horrible. The expected usage model as described in the
> doc is using migration to seed a cgroup (or even better, use the new
> clone call to start in the target cgroup) and then stay there until exit.
> All existing controllers assume this usage model and I'm likely to nack
> deviation unless there are some super strong justifications.
> 
Thank you so much for the pointers. Just to be clear, you meant:
1. Use clone3 CLONE_INTO_CGROUP to put the child into a different cgroup.
2. Do not support migration of the parent (existing process)?

Thanks,

Jacob


Re: [RFC PATCH 18/18] ioasid: Add /dev/ioasid for userspace

2021-03-11 Thread Jacob Pan
Hi Jason,

Thanks for the review.

On Wed, 10 Mar 2021 15:23:01 -0400, Jason Gunthorpe  wrote:

> On Sat, Feb 27, 2021 at 02:01:26PM -0800, Jacob Pan wrote:
> 
> > +/*  IOCTLs for IOASID file descriptor (/dev/ioasid)  */
> > +
> > +/**
> > + * IOASID_GET_API_VERSION - _IO(IOASID_TYPE, IOASID_BASE + 0)
> > + *
> > + * Report the version of the IOASID API.  This allows us to bump the
> > + * entire API version should we later need to add or change features
> > + * in incompatible ways.
> > + * Return: IOASID_API_VERSION
> > + * Availability: Always
> > + */
> > +#define IOASID_GET_API_VERSION	_IO(IOASID_TYPE, IOASID_BASE + 0)  
> 
> I think this is generally a bad idea, if you change the API later then
> also change the ioctl numbers and everything should work out
> 
> eg use the 4th argument to IOC to specify something about the ABI
> 
Let me try to understand the idea; do you mean something like this?
#define IOASID_GET_INFO _IOC(_IOC_NONE, IOASID_TYPE, IOASID_BASE + 1,
sizeof(struct ioasid_info))

If we later change the size of struct ioasid_info, IOASID_GET_INFO would be
a different ioctl number. Then we would break existing user space that uses
the old number. So I am guessing you meant we also need a different name,
i.e.

#define IOASID_GET_INFO_V2 _IOC(_IOC_NONE, IOASID_TYPE, IOASID_BASE + 1,
sizeof(struct ioasid_info_v2))

We can get rid of the API version and just have individual ioctl versions.
Is that right?
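
For instance (a sketch only: the struct layouts here are hypothetical,
IOASID_TYPE and IOASID_BASE are the constants from this RFC, and _IOC()
comes from <linux/ioctl.h>):

/*
 * _IOC() encodes direction, type, number, and size into the ioctl
 * number, so growing the struct changes sizeof() and therefore the
 * encoded number. Old binaries keep using the old number automatically.
 */
struct ioasid_info {			/* hypothetical v1 layout */
	__u32	argsz;
	__u32	flags;
};

struct ioasid_info_v2 {			/* hypothetical v2, grown by 8 bytes */
	__u32	argsz;
	__u32	flags;
	__u64	extension;
};

#define IOASID_GET_INFO		_IOC(_IOC_NONE, IOASID_TYPE, IOASID_BASE + 1, \
				     sizeof(struct ioasid_info))
#define IOASID_GET_INFO_V2	_IOC(_IOC_NONE, IOASID_TYPE, IOASID_BASE + 1, \
				     sizeof(struct ioasid_info_v2))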

> Jason


Thanks,

Jacob


Re: [RFC PATCH 15/18] cgroup: Introduce ioasids controller

2021-03-05 Thread Jacob Pan
Hi Jean-Philippe,

On Fri, 5 Mar 2021 09:30:49 +0100, Jean-Philippe Brucker
 wrote:

> On Thu, Mar 04, 2021 at 09:46:03AM -0800, Jacob Pan wrote:
> > Hi Jean-Philippe,
> > 
> > On Thu, 4 Mar 2021 10:49:37 +0100, Jean-Philippe Brucker
> >  wrote:
> >   
> > > On Wed, Mar 03, 2021 at 04:02:05PM -0800, Jacob Pan wrote:  
> > > > Hi Jacob,
> > > > 
> > > > On Wed, 3 Mar 2021 13:17:26 -0800, Jacob Pan
> > > >  wrote:
> > > > 
> > > > > Hi Tejun,
> > > > > 
> > > > > On Wed, 3 Mar 2021 10:44:28 -0500, Tejun Heo 
> > > > > wrote: 
> > > > > > On Sat, Feb 27, 2021 at 02:01:23PM -0800, Jacob Pan wrote:  
> > > > > > > IOASIDs are used to associate DMA requests with virtual
> > > > > > > address spaces. They are a system-wide limited resource made
> > > > > > > available to the userspace applications. Let it be VMs or
> > > > > > > user-space device drivers.
> > > > > > > 
> > > > > > > This RFC patch introduces a cgroup controller to address the
> > > > > > > following problems:
> > > > > > > 1. Some user applications exhaust all the available IOASIDs
> > > > > > > thus depriving others of the same host.
> > > > > > > 2. System admins need to provision VMs based on their needs
> > > > > > > for IOASIDs, e.g. the number of VMs with assigned devices
> > > > > > > that perform DMA requests with PASID.
> > > > > > 
> > > > > > Please take a look at the proposed misc controller:
> > > > > > 
> > > > > >  
> > > > > > http://lkml.kernel.org/r/20210302081705.1990283-2-vipi...@google.com
> > > > > > 
> > > > > > Would that fit your bill?  
> > > > > The interface definitely can be reused. But IOASID has a different
> > > > > behavior in terms of migration and ownership checking. I guess
> > > > > SEV key IDs are not tied to a process whereas IOASIDs are.
> > > > > Perhaps this can be solved by adding
> > > > > + .can_attach = ioasids_can_attach,
> > > > > + .cancel_attach  = ioasids_cancel_attach,
> > > > > Let me give it a try and come back.
> > > > > 
> > > > While I am trying to fit the IOASIDs cgroup into the misc cgroup
> > > > proposal, I'd like to have a direction check on whether this idea of
> > > > using cgroup for IOASID/PASID resource management is viable. 
> > > 
> > > Yes, even for host SVA it would be good to have a cgroup. Currently
> > > the number of shared address spaces is naturally limited by number of
> > > processes, which can be controlled with rlimit and cgroup. But on Arm
> > > the hardware limit on shared address spaces is 64k (number of ASIDs),
> > > easily exhausted with the default PASID and PID limits. So a cgroup
> > > for managing this resource is more than welcome.
> > > 
> > > It looks like your current implementation is very dependent on
> > > IOASID_SET_TYPE_MM?  I'll need to do more reading about cgroup to see
> > > how easily it can be adapted to host SVA which uses
> > > IOASID_SET_TYPE_NULL. 
> > Right, I was assuming three use cases of IOASIDs:
> > 1. host supervisor SVA (not a concern, just one init_mm to bind)
> > 2. host user SVA, either one IOASID per process or perhaps some private
> > IOASID for private address space
> > 3. VM use for guest SVA, each IOASID is bound to a guest process
> > 
> > My current cgroup proposal applies to #3 with IOASID_SET_TYPE_MM, which
> > is allocated by the new /dev/ioasid interface.
> > 
> > For #2, I was thinking you can limit the host process via PIDs cgroup?
> > i.e. limit fork.  
> 
> That works but isn't perfect, because the hardware resource of shared
> address spaces can be much lower than the PID limit - 16k ASIDs on Arm. To
> allow an admin to fairly distribute that resource we could introduce
> another cgroup just to limit the number of shared address spaces, but
> limiting the number of IOASIDs does the trick.
> 
Makes sense. It would be cleaner to have a single approach to limit IOASIDs
(as Jason asked).

> > So the host IOASIDs are currently allocated from the system pool
> > with a quota chosen by iommu_sva_init() in my patch; 0 means unlimited,
> > use whatever is available. https://lkml.org/lkml/2021/2/28/18  
> 
> Yes that's sensible, but it would be good to plan the cgroup user
> interface to work for #2 as well, even if we don't implement it right
> away.
> 
Will do in the next version.

> Thanks,
> Jean


Thanks,

Jacob


Re: [RFC PATCH 15/18] cgroup: Introduce ioasids controller

2021-03-04 Thread Jacob Pan
Hi Jason,

On Thu, 4 Mar 2021 15:02:53 -0400, Jason Gunthorpe  wrote:

> On Thu, Mar 04, 2021 at 11:01:44AM -0800, Jacob Pan wrote:
> 
> > > For something like qemu I'd expect to put the qemu process in a cgroup
> > > with 1 PASID. Who cares what qemu uses the PASID for, or how it was
> > > allocated?  
> > 
> > For vSVA, we will need one PASID per guest process. But that is up to
> > the admin based on whether or how many SVA capable devices are directly
> > assigned.  
> 
> I hope the virtual IOMMU driver can communicate the PASID limit and
> the cgroup machinery in the guest can know what the actual limit is.
> 
For VT-d, the emulated vIOMMU can communicate with the guest IOMMU driver
on how many PASID bits are supported (extended capability register PASID
size fields). But it cannot communicate how many PASIDs are in the pool
(host cgroup capacity).

The QEMU process may not be the only one in a cgroup, so it cannot give
hard guarantees. I don't see a good way to communicate accurately at
runtime as the process migrates or the limit changes.

We were thinking of adopting the "Limits" model as defined in the cgroup-v2
doc.
"
Limits
--

A child can only consume upto the configured amount of the resource.
Limits can be over-committed - the sum of the limits of children can
exceed the amount of resource available to the parent.
"

So the guest cgroup would still think it has the full 20 bits of PASID at
its disposal, but PASID allocation may fail before reaching the full 20
bits (1M).
Similarly on the host side, we only enforce the limit set by the cgroup but
do not guarantee it.
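
For example, with the ioasids controller interface proposed in this series
(names not final), limits can be over-committed like this:

   $ echo 10 > /mnt/cg2/parent/ioasids.max
   $ mkdir /mnt/cg2/parent/vm1 /mnt/cg2/parent/vm2
   $ echo 8 > /mnt/cg2/parent/vm1/ioasids.max
   $ echo 8 > /mnt/cg2/parent/vm2/ioasids.max

The children's limits sum to 16 while the parent allows only 10, so each
child may allocate up to 8, but allocations start failing once the
parent's 10 are exhausted.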

> I was thinking of a case where qemu is using a single PASID to setup
> the guest kVA or similar
> 
Got it.

> Jason


Thanks,

Jacob


Re: [RFC PATCH 15/18] cgroup: Introduce ioasids controller

2021-03-04 Thread Jacob Pan
Hi Jason,

On Thu, 4 Mar 2021 13:54:02 -0400, Jason Gunthorpe  wrote:

> On Thu, Mar 04, 2021 at 09:46:03AM -0800, Jacob Pan wrote:
> 
> > Right, I was assuming three use cases of IOASIDs:
> > 1. host supervisor SVA (not a concern, just one init_mm to bind)
> > 2. host user SVA, either one IOASID per process or perhaps some private
> > IOASID for private address space
> > 3. VM use for guest SVA, each IOASID is bound to a guest process
> > 
> > My current cgroup proposal applies to #3 with IOASID_SET_TYPE_MM, which
> > is allocated by the new /dev/ioasid interface.
> > 
> > For #2, I was thinking you can limit the host process via PIDs cgroup?
> > i.e. limit fork. So the host IOASIDs are currently allocated from the
> > system pool with quota of chosen by iommu_sva_init() in my patch, 0
> > means unlimited use whatever is available.
> > https://lkml.org/lkml/2021/2/28/18  
> 
> Why do we need two pools?
> 
> If PASID's are limited then why does it matter how the PASID was
> allocated? Either the thing requesting it is below the limit, or it
> isn't.
> 
You are right. It should be tracked based on the process, regardless of
whether it is allocated by the user (/dev/ioasid) or indirectly by kernel
drivers during iommu_sva_bind_device(). We need to consolidate both 2 and 3
and decouple the cgroup from the IOASID set.
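
Something like the sketch below is what I have in mind (a rough sketch;
ioasid_cg_charge_mm() is a hypothetical helper, only ioasid_alloc() and
INVALID_IOASID are from this series):

/*
 * Charge the allocating process's cgroup in one place, whether the
 * request came from /dev/ioasid or from iommu_sva_bind_device().
 */
ioasid_t ioasid_alloc_charged(struct ioasid_set *set, struct mm_struct *mm,
			      ioasid_t min, ioasid_t max)
{
	if (ioasid_cg_charge_mm(mm))	/* hypothetical per-mm charge */
		return INVALID_IOASID;

	return ioasid_alloc(set, min, max, NULL);
}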

> For something like qemu I'd expect to put the qemu process in a cgroup
> with 1 PASID. Who cares what qemu uses the PASID for, or how it was
> allocated?
> 
For vSVA, we will need one PASID per guest process. But that is up to the
admin based on whether or how many SVA capable devices are directly
assigned.

> Jason


Thanks,

Jacob


Re: [RFC PATCH 15/18] cgroup: Introduce ioasids controller

2021-03-04 Thread Jacob Pan
Hi Jean-Philippe,

On Thu, 4 Mar 2021 10:49:37 +0100, Jean-Philippe Brucker
 wrote:

> On Wed, Mar 03, 2021 at 04:02:05PM -0800, Jacob Pan wrote:
> > Hi Jacob,
> > 
> > On Wed, 3 Mar 2021 13:17:26 -0800, Jacob Pan
> >  wrote:
> >   
> > > Hi Tejun,
> > > 
> > > On Wed, 3 Mar 2021 10:44:28 -0500, Tejun Heo  wrote:
> > >   
> > > > On Sat, Feb 27, 2021 at 02:01:23PM -0800, Jacob Pan wrote:
> > > > > IOASIDs are used to associate DMA requests with virtual address
> > > > > spaces. They are a system-wide limited resource made available to
> > > > > the userspace applications. Let it be VMs or user-space device
> > > > > drivers.
> > > > > 
> > > > > This RFC patch introduces a cgroup controller to address the
> > > > > following problems:
> > > > > 1. Some user applications exhaust all the available IOASIDs thus
> > > > > depriving others of the same host.
> > > > > 2. System admins need to provision VMs based on their needs for
> > > > > IOASIDs, e.g. the number of VMs with assigned devices that perform
> > > > > DMA requests with PASID.  
> > > > 
> > > > Please take a look at the proposed misc controller:
> > > > 
> > > >  http://lkml.kernel.org/r/20210302081705.1990283-2-vipi...@google.com
> > > > 
> > > > Would that fit your bill?
> > > The interface definitely can be reused. But IOASID has a different
> > > behavior in terms of migration and ownership checking. I guess SEV key
> > > IDs are not tied to a process whereas IOASIDs are. Perhaps this can be
> > > solved by adding
> > > + .can_attach = ioasids_can_attach,
> > > + .cancel_attach  = ioasids_cancel_attach,
> > > Let me give it a try and come back.
> > >   
> > While I am trying to fit the IOASIDs cgroup into the misc cgroup
> > proposal, I'd like to have a direction check on whether this idea of
> > using cgroup for IOASID/PASID resource management is viable.  
> 
> Yes, even for host SVA it would be good to have a cgroup. Currently the
> number of shared address spaces is naturally limited by number of
> processes, which can be controlled with rlimit and cgroup. But on Arm the
> hardware limit on shared address spaces is 64k (number of ASIDs), easily
> exhausted with the default PASID and PID limits. So a cgroup for managing
> this resource is more than welcome.
> 
> It looks like your current implementation is very dependent on
> IOASID_SET_TYPE_MM?  I'll need to do more reading about cgroup to see how
> easily it can be adapted to host SVA which uses IOASID_SET_TYPE_NULL.
> 
Right, I was assuming three use cases of IOASIDs:
1. host supervisor SVA (not a concern, just one init_mm to bind)
2. host user SVA, either one IOASID per process or perhaps some private
IOASID for private address space
3. VM use for guest SVA, each IOASID is bound to a guest process

My current cgroup proposal applies to #3 with IOASID_SET_TYPE_MM, which is
allocated by the new /dev/ioasid interface.

For #2, I was thinking you could limit the host process via the PIDs
cgroup, i.e. limit fork. So the host IOASIDs are currently allocated from
the system pool with a quota chosen by iommu_sva_init() in my patch; 0
means unlimited, use whatever is available. https://lkml.org/lkml/2021/2/28/18


> Thanks,
> Jean


Thanks,

Jacob


Re: [RFC v2 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-03-03 Thread Jacob Pan
Hi Vipin,

On Tue,  2 Mar 2021 00:17:05 -0800, Vipin Sharma  wrote:

> +Migration and Ownership
> +~~~
> +
> +A miscellaneous scalar resource is charged to the cgroup in which it is
> used +first, and stays charged to that cgroup until that resource is
> freed. Migrating +a process to a different cgroup does not move the
> charge to the destination +cgroup where the process has moved.
> +
I am trying to see if IOASIDs cgroup can also fit in this misc controller
as yet another resource type.
https://lore.kernel.org/linux-iommu/20210303131726.7a8cb169@jacob-builder/T/#u
However, unlike SEV, IOASIDs need to be migrated if the process is moved to
another cgroup, i.e. charge the destination and uncharge the source.

Do you think this behavior can be achieved by differentiating resource
types? i.e. add attach callbacks for certain types. Having a single misc
interface seems cleaner than creating another controller.

Thanks,

Jacob


Re: [RFC PATCH 15/18] cgroup: Introduce ioasids controller

2021-03-03 Thread Jacob Pan
Hi Jacob,

On Wed, 3 Mar 2021 13:17:26 -0800, Jacob Pan
 wrote:

> Hi Tejun,
> 
> On Wed, 3 Mar 2021 10:44:28 -0500, Tejun Heo  wrote:
> 
> > On Sat, Feb 27, 2021 at 02:01:23PM -0800, Jacob Pan wrote:  
> > > IOASIDs are used to associate DMA requests with virtual address
> > > spaces. They are a system-wide limited resource made available to the
> > > userspace applications. Let it be VMs or user-space device drivers.
> > > 
> > > This RFC patch introduces a cgroup controller to address the following
> > > problems:
> > > 1. Some user applications exhaust all the available IOASIDs thus
> > > depriving others of the same host.
> > > 2. System admins need to provision VMs based on their needs for
> > > IOASIDs, e.g. the number of VMs with assigned devices that perform
> > > DMA requests with PASID.
> > 
> > Please take a look at the proposed misc controller:
> > 
> >  http://lkml.kernel.org/r/20210302081705.1990283-2-vipi...@google.com
> > 
> > Would that fit your bill?  
> The interface definitely can be reused. But IOASID has a different
> behavior in terms of migration and ownership checking. I guess SEV key
> IDs are not tied to a process whereas IOASIDs are. Perhaps this can be
> solved by adding
> + .can_attach = ioasids_can_attach,
> + .cancel_attach  = ioasids_cancel_attach,
> Let me give it a try and come back.
> 
While I am trying to fit the IOASIDs cgroup into the misc cgroup proposal,
I'd like to have a direction check on whether this idea of using cgroups
for IOASID/PASID resource management is viable.

Alex/Jason/Jean and everyone, your feedback is much appreciated.

> Thanks for the pointer.
> 
> Jacob
> 
> > 
> > Thanks.
> >   
> 
> 
> Thanks,
> 
> Jacob


Thanks,

Jacob


Re: [RFC PATCH 15/18] cgroup: Introduce ioasids controller

2021-03-03 Thread Jacob Pan
Hi Tejun,

On Wed, 3 Mar 2021 10:44:28 -0500, Tejun Heo  wrote:

> On Sat, Feb 27, 2021 at 02:01:23PM -0800, Jacob Pan wrote:
> > IOASIDs are used to associate DMA requests with virtual address spaces.
> > They are a system-wide limited resource made available to the userspace
> > applications. Let it be VMs or user-space device drivers.
> > 
> > This RFC patch introduces a cgroup controller to address the following
> > problems:
> > 1. Some user applications exhaust all the available IOASIDs thus
> > depriving others of the same host.
> > 2. System admins need to provision VMs based on their needs for IOASIDs,
> > e.g. the number of VMs with assigned devices that perform DMA requests
> > with PASID.  
> 
> Please take a look at the proposed misc controller:
> 
>  http://lkml.kernel.org/r/20210302081705.1990283-2-vipi...@google.com
> 
> Would that fit your bill?
The interface can definitely be reused. But IOASIDs have different behavior
in terms of migration and ownership checking. I guess SEV key IDs are not
tied to a process, whereas IOASIDs are. Perhaps this can be solved by
adding
+   .can_attach = ioasids_can_attach,
+   .cancel_attach  = ioasids_cancel_attach,
Let me give it a try and come back.

Thanks for the pointer.

Jacob

> 
> Thanks.
> 


Thanks,

Jacob


Re: [PATCH RFC v1 12/15] iommu/virtio: Add support for INVALIDATE request

2021-03-03 Thread Jacob Pan
Hi Vivek,

On Fri, 15 Jan 2021 17:43:39 +0530, Vivek Gautam 
wrote:

> From: Jean-Philippe Brucker 
> 
> Add support for tlb invalidation ops that can send invalidation
> requests to back-end virtio-iommu when stage-1 page tables are
> supported.
> 
Just curious whether it is possible to reuse the IOMMU UAPI for
invalidation and other operations. When we started out designing the IOMMU
UAPI, the intention was to support both emulated and virtio IOMMUs.

> Signed-off-by: Jean-Philippe Brucker 
> [Vivek: Refactoring the iommu_flush_ops, and adding only one pasid sync
> op that's needed with current iommu-pasid-table infrastructure.
>   Also updating uapi defines as required by latest changes]
> Signed-off-by: Vivek Gautam 
> Cc: Joerg Roedel 
> Cc: Will Deacon 
> Cc: Michael S. Tsirkin 
> Cc: Robin Murphy 
> Cc: Jean-Philippe Brucker 
> Cc: Eric Auger 
> Cc: Alex Williamson 
> Cc: Kevin Tian 
> Cc: Jacob Pan 
> Cc: Liu Yi L 
> Cc: Lorenzo Pieralisi 
> Cc: Shameerali Kolothum Thodi 
> ---
>  drivers/iommu/virtio-iommu.c | 95 
>  1 file changed, 95 insertions(+)
> 
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index ae5dfd3f8269..004ea94e3731 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -13,6 +13,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -63,6 +64,8 @@ struct viommu_mapping {
>  };
>  
>  struct viommu_mm {
> + int pasid;
> + u64 archid;
>   struct io_pgtable_ops   *ops;
>   struct viommu_domain*domain;
>  };
> @@ -692,6 +695,98 @@ static void viommu_event_handler(struct virtqueue
> *vq) virtqueue_kick(vq);
>  }
>  
> +/* PASID and pgtable APIs */
> +
> +static void __viommu_flush_pasid_tlb_all(struct viommu_domain *vdomain,
> +					 int pasid, u64 arch_id, int type)
> +{
> + struct virtio_iommu_req_invalidate req = {
> + .head.type  = VIRTIO_IOMMU_T_INVALIDATE,
> +		.inv_gran	= cpu_to_le32(VIRTIO_IOMMU_INVAL_G_PASID),
> +		.flags		= cpu_to_le32(VIRTIO_IOMMU_INVAL_F_PASID),
> + .inv_type   = cpu_to_le32(type),
> +
> + .domain = cpu_to_le32(vdomain->id),
> + .pasid  = cpu_to_le32(pasid),
> + .archid = cpu_to_le64(arch_id),
> + };
> +
> +	if (viommu_send_req_sync(vdomain->viommu, &req, sizeof(req)))
> + pr_debug("could not send invalidate request\n");
> +}
> +
> +static void viommu_flush_tlb_add(struct iommu_iotlb_gather *gather,
> +  unsigned long iova, size_t granule,
> +  void *cookie)
> +{
> + struct viommu_mm *viommu_mm = cookie;
> + struct viommu_domain *vdomain = viommu_mm->domain;
> +	struct iommu_domain *domain = &vdomain->domain;
> +
> + iommu_iotlb_gather_add_page(domain, gather, iova, granule);
> +}
> +
> +static void viommu_flush_tlb_walk(unsigned long iova, size_t size,
> +   size_t granule, void *cookie)
> +{
> + struct viommu_mm *viommu_mm = cookie;
> + struct viommu_domain *vdomain = viommu_mm->domain;
> + struct virtio_iommu_req_invalidate req = {
> + .head.type  = VIRTIO_IOMMU_T_INVALIDATE,
> + .inv_gran   = cpu_to_le32(VIRTIO_IOMMU_INVAL_G_VA),
> + .inv_type   = cpu_to_le32(VIRTIO_IOMMU_INV_T_IOTLB),
> +		.flags		= cpu_to_le32(VIRTIO_IOMMU_INVAL_F_ARCHID),
> +
> + .domain = cpu_to_le32(vdomain->id),
> + .pasid  = cpu_to_le32(viommu_mm->pasid),
> + .archid = cpu_to_le64(viommu_mm->archid),
> + .virt_start = cpu_to_le64(iova),
> + .nr_pages   = cpu_to_le64(size / granule),
> + .granule= ilog2(granule),
> + };
> +
> +	if (viommu_add_req(vdomain->viommu, &req, sizeof(req)))
> + pr_debug("could not add invalidate request\n");
> +}
> +
> +static void viommu_flush_tlb_all(void *cookie)
> +{
> + struct viommu_mm *viommu_mm = cookie;
> +
> + if (!viommu_mm->archid)
> + return;
> +
> + __viommu_flush_pasid_tlb_all(viommu_mm->domain, viommu_mm->pasid,
> +  viommu_mm->archid,
> +  VIRTIO_IOMMU_INV_T_IOTLB);
> +}
> +
> +static struct iommu_flush_ops viommu_flus

[PATCH v2 1/4] iommu/vt-d: Enable write protect for supervisor SVM

2021-03-02 Thread Jacob Pan
The write protect bit, when set, inhibits supervisor writes to read-only
pages. In supervisor shared virtual addressing (SVA), where page tables
are shared between the CPU and DMA, the IOMMU PASID entry WPE bit should
match the CR0.WP bit in the CPU.
This patch sets the WPE bit for supervisor PASIDs if CR0.WP is set.

Signed-off-by: Sanjay Kumar 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/pasid.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 0cceaabc3ce6..0b7e0e726ade 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -410,6 +410,15 @@ static inline void pasid_set_sre(struct pasid_entry *pe)
	pasid_set_bits(&pe->val[2], 1 << 0, 1);
 }
 
+/*
+ * Setup the WPE(Write Protect Enable) field (Bit 132) of a
+ * scalable mode PASID entry.
+ */
+static inline void pasid_set_wpe(struct pasid_entry *pe)
+{
+   pasid_set_bits(&pe->val[2], 1 << 4, 1 << 4);
+}
+
 /*
  * Setup the P(Present) field (Bit 0) of a scalable mode PASID
  * entry.
@@ -553,6 +562,20 @@ static void pasid_flush_caches(struct intel_iommu *iommu,
}
 }
 
+static inline int pasid_enable_wpe(struct pasid_entry *pte)
+{
+   unsigned long cr0 = read_cr0();
+
+   /* CR0.WP is normally set but just to be sure */
+   if (unlikely(!(cr0 & X86_CR0_WP))) {
+   pr_err_ratelimited("No CPU write protect!\n");
+   return -EINVAL;
+   }
+   pasid_set_wpe(pte);
+
+   return 0;
+}
+
 /*
  * Set up the scalable mode pasid table entry for first only
  * translation type.
@@ -584,6 +607,9 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
return -EINVAL;
}
pasid_set_sre(pte);
+   if (pasid_enable_wpe(pte))
+   return -EINVAL;
+
}
 
if (flags & PASID_FLAG_FL5LP) {
-- 
2.25.1



[PATCH v2 0/4] Misc vSVA fixes for VT-d

2021-03-02 Thread Jacob Pan
Hi Baolu et al,

This is a collection of SVA-related fixes.

ChangeLog:

v2:
- For guest SVA, call pasid_set_wpe directly w/o checking host CR0.wp
  (Review comments by Kevin T.)
- Added fixes tag

Thanks,

Jacob

Jacob Pan (4):
  iommu/vt-d: Enable write protect for supervisor SVM
  iommu/vt-d: Enable write protect propagation from guest
  iommu/vt-d: Reject unsupported page request modes
  iommu/vt-d: Calculate and set flags for handle_mm_fault

 drivers/iommu/intel/pasid.c | 29 +
 drivers/iommu/intel/svm.c   | 21 +
 include/uapi/linux/iommu.h  |  3 ++-
 3 files changed, 48 insertions(+), 5 deletions(-)

-- 
2.25.1



[PATCH v2 4/4] iommu/vt-d: Calculate and set flags for handle_mm_fault

2021-03-02 Thread Jacob Pan
Page requests originate from user page faults. Therefore, we shall set
FAULT_FLAG_USER. 

FAULT_FLAG_REMOTE indicates that we are walking an mm which is not
guaranteed to be the same as current->mm and should not be subject
to protection key enforcement. Therefore, we should set FAULT_FLAG_REMOTE
to avoid faults when both SVM and PKEY are used.

References: commit 1b2ee1266ea6 ("mm/core: Do not enforce PKEY permissions on 
remote mm access")
Reviewed-by: Raj Ashok 
Acked-by: Lu Baolu 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/svm.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index ff7ae7cc17d5..7bfd20a24a60 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -1086,6 +1086,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
struct intel_iommu *iommu = d;
struct intel_svm *svm = NULL;
int head, tail, handled = 0;
+   unsigned int flags = 0;
 
/* Clear PPR bit before reading head/tail registers, to
 * ensure that we get a new interrupt if needed. */
@@ -1186,9 +1187,11 @@ static irqreturn_t prq_event_thread(int irq, void *d)
if (access_error(vma, req))
goto invalid;
 
-   ret = handle_mm_fault(vma, address,
- req->wr_req ? FAULT_FLAG_WRITE : 0,
- NULL);
+   flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE;
+   if (req->wr_req)
+   flags |= FAULT_FLAG_WRITE;
+
+   ret = handle_mm_fault(vma, address, flags, NULL);
if (ret & VM_FAULT_ERROR)
goto invalid;
 
-- 
2.25.1



[PATCH v2 3/4] iommu/vt-d: Reject unsupported page request modes

2021-03-02 Thread Jacob Pan
When supervisor/privilege mode SVM is used, we bind init_mm.pgd with
a supervisor PASID. There should not be any page faults for init_mm.
An execution request with DMA read is also not supported.

This patch checks the PRQ descriptor for both unsupported configurations
and rejects them with invalid responses.

Fixes: 1c4f88b7f1f92 ("iommu/vt-d: Shared virtual address in scalable
mode")
Acked-by: Lu Baolu 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/svm.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 23a1e4f58c54..ff7ae7cc17d5 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -1113,7 +1113,17 @@ static irqreturn_t prq_event_thread(int irq, void *d)
   ((unsigned long long *)req)[1]);
goto no_pasid;
}
-
+   /* We shall not receive page request for supervisor SVM */
+   if (req->pm_req && (req->rd_req | req->wr_req)) {
+   pr_err("Unexpected page request in Privilege Mode");
+   /* No need to find the matching sdev as for bad_req */
+   goto no_pasid;
+   }
+   /* DMA read with exec request is not supported. */
+   if (req->exe_req && req->rd_req) {
+   pr_err("Execution request not supported\n");
+   goto no_pasid;
+   }
if (!svm || svm->pasid != req->pasid) {
rcu_read_lock();
svm = ioasid_find(NULL, req->pasid, NULL);
-- 
2.25.1



[PATCH v2 2/4] iommu/vt-d: Enable write protect propagation from guest

2021-03-02 Thread Jacob Pan
The write protect bit, when set, inhibits supervisor writes to read-only
pages. In guest supervisor shared virtual addressing (SVA), write protect
should be honored upon a guest bind supervisor PASID request.

This patch extends the VT-d portion of the IOMMU UAPI to include the WP
bit. The WPE bit of the supervisor PASID entry will be set to match the
CPU CR0.WP bit.

Signed-off-by: Sanjay Kumar 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/pasid.c | 3 +++
 include/uapi/linux/iommu.h  | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 0b7e0e726ade..b7e39239f539 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -763,6 +763,9 @@ intel_pasid_setup_bind_data(struct intel_iommu *iommu, 
struct pasid_entry *pte,
return -EINVAL;
}
pasid_set_sre(pte);
+   /* Enable write protect WP if guest requested */
+   if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_WPE)
+   pasid_set_wpe(pte);
}
 
if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) {
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
index 35d48843acd8..3a9164cc9937 100644
--- a/include/uapi/linux/iommu.h
+++ b/include/uapi/linux/iommu.h
@@ -288,7 +288,8 @@ struct iommu_gpasid_bind_data_vtd {
 #define IOMMU_SVA_VTD_GPASID_PWT   (1 << 3) /* page-level write through */
 #define IOMMU_SVA_VTD_GPASID_EMTE  (1 << 4) /* extended mem type enable */
 #define IOMMU_SVA_VTD_GPASID_CD	(1 << 5) /* PASID-level cache disable */
-#define IOMMU_SVA_VTD_GPASID_LAST  (1 << 6)
+#define IOMMU_SVA_VTD_GPASID_WPE   (1 << 6) /* Write protect enable */
+#define IOMMU_SVA_VTD_GPASID_LAST  (1 << 7)
__u64 flags;
__u32 pat;
__u32 emt;
-- 
2.25.1



Re: [PATCH v6 08/12] fork: Clear PASID for new mm

2021-03-01 Thread Jacob Pan
Hi Fenghua,

On Thu, 25 Feb 2021 22:17:11 +, Fenghua Yu  wrote:

> Hi, Jean,
> 
> On Wed, Feb 24, 2021 at 11:19:27AM +0100, Jean-Philippe Brucker wrote:
> > Hi Fenghua,
> > 
> > [Trimmed the Cc list]
> > 
> > On Mon, Jul 13, 2020 at 04:48:03PM -0700, Fenghua Yu wrote:  
> > > When a new mm is created, its PASID should be cleared, i.e. the PASID
> > > is initialized to its init state 0 on both ARM and X86.  
> > 
> > I just noticed this patch was dropped in v7, and am wondering whether we
> > could still upstream it. Does x86 need a child with a new address space
> > (!CLONE_VM) to inherit the PASID of the parent?  That doesn't make much
> > sense with regard to IOMMU structures - same PASID indexing multiple
> > PGDs?  
> 
> You are right: x86 should clear mm->pasid when a new mm is created.
> This patch somehow got lost :(
> 
> > 
> > Currently iommu_sva_alloc_pasid() assumes mm->pasid is always
> > initialized to 0 and fails on forked tasks. I'm trying to figure out
> > how to fix this. Could we clear the pasid on fork or does it break the
> > x86 model?  
> 
> x86 calls ioasid_alloc() instead of iommu_sva_alloc_pasid(). So
We should consolidate at some point; there is no need to store the PASID in
two places.

> functionality is not a problem without this patch on x86. But I think
I feel the reason that x86 doesn't care is that mm->pasid is not used
unless bind_mm is called. For forked children, even if mm->pasid is
non-zero, it has no effect since it is not loaded into the MSR.
Perhaps you could also add a check or WARN_ON(!mm->pasid) in load_pasid()?
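Something along these lines (a sketch only; the actual load_pasid()
signature and MSR plumbing in the x86 series may differ):

static void load_pasid(struct mm_struct *mm, unsigned long pasid)
{
	/* catch callers trying to load an uninitialized PASID */
	WARN_ON_ONCE(!mm->pasid);

	/* ... update the IA32_PASID MSR as before ... */
}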

> we do need to have this patch in the kernel because PASID is per addr
> space and two addr spaces shouldn't have the same PASID.
> 
Agreed.

> Who will accept this patch?
> 
> Thanks.
> 
> -Fenghua


Thanks,

Jacob


[RFC PATCH 17/18] docs: cgroup-v1: Add IOASIDs controller

2021-02-27 Thread Jacob Pan
Signed-off-by: Jacob Pan 
---
 Documentation/admin-guide/cgroup-v1/index.rst |   1 +
 .../admin-guide/cgroup-v1/ioasids.rst | 110 ++
 2 files changed, 111 insertions(+)
 create mode 100644 Documentation/admin-guide/cgroup-v1/ioasids.rst

diff --git a/Documentation/admin-guide/cgroup-v1/index.rst 
b/Documentation/admin-guide/cgroup-v1/index.rst
index 226f64473e8e..f5e307dc4dbb 100644
--- a/Documentation/admin-guide/cgroup-v1/index.rst
+++ b/Documentation/admin-guide/cgroup-v1/index.rst
@@ -15,6 +15,7 @@ Control Groups version 1
 devices
 freezer-subsystem
 hugetlb
+ioasids
 memcg_test
 memory
 net_cls
diff --git a/Documentation/admin-guide/cgroup-v1/ioasids.rst 
b/Documentation/admin-guide/cgroup-v1/ioasids.rst
new file mode 100644
index ..b30eb41bf1be
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/ioasids.rst
@@ -0,0 +1,110 @@
+
+I/O Address Space ID (IOASID) Controller
+
+
+Acronyms
+
+PASID:
+   Process Address Space ID, defined by PCIe
+SVA:
+   Shared Virtual Address
+
+Introduction
+
+
+IOASIDs are used to associate DMA requests with virtual address spaces. As
+a system-wide limited¹ resource, their constraints are managed by the
+IOASIDs cgroup subsystem. The specific use cases are:
+
+1. Some user applications exhaust all the available IOASIDs, thus depriving
+   others on the same host.
+
+2. System admins need to provision VMs based on their needs for IOASIDs,
+   e.g. the number of VMs with assigned devices that perform DMA requests
+   with PASID.
+
+The IOASID subsystem consists of three components:
+
+- IOASID core: provides APIs for allocation, pool management,
+  notifications and refcounting. See Documentation/driver-api/ioasid.rst
+  for details
+- IOASID user:  provides user allocation interface via /dev/ioasid
+- IOASID cgroup controller: manage resource distribution
+
+Resource Distribution Model
+---
+IOASID allocation is process-based in that IOASIDs are tied to page
+tables²; the threaded model is not supported. Allocation is rejected by the
+cgroup hierarchy once a limit is reached. However, organizational changes
+such as moving processes across cgroups are exempted; therefore, it is
+possible to have ioasids.current > ioasids.max. No further allocation is
+possible after an organizational change that exceeds the max.
+
+The system capacity of IOASIDs defaults to the PCIe PASID size of 20 bits.
+The IOASID core provides an API to adjust the system capacity based on the
+platform. IOASIDs are used by both user applications (e.g. VMs and
+userspace drivers) and the kernel (e.g. supervisor SVA). However, only user
+allocations are subject to cgroup constraints. The host kernel allocates a
+pool of IOASIDs whose quota is subtracted from the system capacity. The
+IOASIDs cgroup consults the IOASID core for available capacity when a new
+cgroup limit is granted. Upon creation, no IOASID allocation is allowed by
+the user processes within the new cgroup.
+
+Usage
+-
+CGroup filesystem has the following IOASIDs controller specific entries:
+::
+
+ ioasids.current
+ ioasids.events
+ ioasids.max
+
+To use the IOASIDs controller, set ioasids.max to the limit of the number
+of IOASIDs that can be allocated. The file ioasids.current shows the current
+number of IOASIDs allocated within the cgroup.
+
+Example
+
+1. Mount the cgroup2 FS ::
+
+   $ mount -t cgroup2 none /mnt/cg2/
+
+2. Add ioasids controller ::
+
+   $ echo '+ioasids' > /mnt/cg2/cgroup.subtree_control
+
+3. Create a hierarchy, set non-zero limit (default 0) ::
+
+   $ mkdir /mnt/cg2/test1
+   $ echo 5 > /mnt/cg2/test1/ioasids.max
+
+4. Allocate IOASIDs within limit should succeed ::
+
+   $echo $$ > /mnt/cg2/test1/cgroup.procs
+   Do IOASID allocation via /dev/ioasid
+   ioasids.current:1
+   ioasids.max:5
+
+5. Attempt to allocate IOASIDs beyond limit should fail ::
+
+   ioasids.current:5
+   ioasids.max:5
+
+6. Attaching a process that already has an IOASID allocated could result in
+ioasids.current > ioasids.max. E.g., a process with PID 1234 under a cgroup
+with the IOASIDs controller has one IOASID allocated; moving it to the
+test1 cgroup ::
+
+   $echo 1234 > /mnt/cg2/test1/cgroup.procs
+   ioasids.current:6
+   ioasids.max:5
+
+Notes
+-
+¹ When IOASID is used for PCI Express PASID, the range is limited to the
+PASID size of 20 bits. For a device whose resources can be shared across
+the platform, the IOASID namespace must be system-wide in order to uniquely
+identify a DMA request with a PASID inside the device.
+
+² The primary use case is SVA, where CPU page tables are shared with DMA via
+IOMMU.
-- 
2.25.1



[RFC PATCH 18/18] ioasid: Add /dev/ioasid for userspace

2021-02-27 Thread Jacob Pan
From: Liu Yi L 

I/O Address Space IDs (IOASIDs) are used to tag DMA requests to target
multiple DMA address spaces for physical devices. The PCI term for this is
PASID (Process Address Space ID). Platforms with PASID support can provide
PASID-granularity DMA isolation, which is very useful for efficient and
secure device sharing (SVA, subdevice passthrough, etc.).

Today only kernel drivers are allowed to allocate IOASIDs [1]. This patch
aims to extend this capability to userspace as required in device pass-
through scenarios. For example, a userspace driver may want to create its
own DMA address spaces besides the default IOVA address space established
by the kernel on the assigned device (e.g. vDPA control vq [2] and guest
SVA [3]), thus need to get IOASIDs from the kernel IOASID allocator for
tagging. In concept, each device can have its own IOASID space, so it is
also possible for a userspace driver to manage a private IOASID space
itself, say, when a PF/VF is assigned. However, this doesn't work for
subdevice pass-through, as multiple subdevices under the same parent device
share a single IOASID space; IOASIDs must therefore be centrally managed by
the kernel in such a case.

This patch introduces a /dev/ioasid interface for this purpose (per discussion
in [4]). An IOASID is just a number before it is tagged to a specific DMA
address space. The actual IOASID tagging (to DMA requests) and association
(with DMA address spaces) operations from userspace are scrutinized by specific
device passthrough frameworks, which must ensure that a malicious driver
cannot program arbitrary IOASIDs to its assigned device to access DMA address
spaces that don't belong to it; this is out of the scope of this patch (a
reference VFIO implementation will be posted soon).

Open:

PCIe PASID is 20 bits, implying a space of 1M IOASIDs. Although that is
plenty, there was an open question [4] on whether this user interface is
open to all processes or only selected processes (e.g. with a device
assigned). In this patch series, a cgroup controller is introduced to
manage the IOASID quota that a process is allowed to use. A cgroup-enabled
system may by default set quota=0 to disallow IOASID allocation for most
processes, and then have the virt management stack adjust the quota for a
process which gets a device assigned. But yeah, we are also willing to
hear more suggestions.

[1] 
https://lore.kernel.org/linux-iommu/156595-62508-8-git-send-email-jacob.jun@linux.intel.com/
[2] https://lore.kernel.org/kvm/20201216064818.48239-1-jasow...@redhat.com/
[3] 
https://lore.kernel.org/linux-iommu/1599734733-6431-1-git-send-email-yi.l@intel.com/
[4] https://lore.kernel.org/kvm/20201014171055.328a5...@w520.home/

Signed-off-by: Liu Yi L 
---
 Documentation/userspace-api/index.rst  |   1 +
 Documentation/userspace-api/ioasid.rst |  49 
 drivers/iommu/Kconfig  |   5 +
 drivers/iommu/Makefile |   1 +
 drivers/iommu/intel/Kconfig|   1 +
 drivers/iommu/ioasid_user.c| 297 +
 include/linux/ioasid.h |  26 +++
 include/linux/miscdevice.h |   1 +
 include/uapi/linux/ioasid.h|  98 
 9 files changed, 479 insertions(+)
 create mode 100644 Documentation/userspace-api/ioasid.rst
 create mode 100644 drivers/iommu/ioasid_user.c
 create mode 100644 include/uapi/linux/ioasid.h

diff --git a/Documentation/userspace-api/index.rst 
b/Documentation/userspace-api/index.rst
index acd2cc2a538d..69e1be7c67ee 100644
--- a/Documentation/userspace-api/index.rst
+++ b/Documentation/userspace-api/index.rst
@@ -24,6 +24,7 @@ place where this information is gathered.
ioctl/index
iommu
media/index
+   ioasid
 
 .. only::  subproject and html
 
diff --git a/Documentation/userspace-api/ioasid.rst 
b/Documentation/userspace-api/ioasid.rst
new file mode 100644
index ..879d6cbae858
--- /dev/null
+++ b/Documentation/userspace-api/ioasid.rst
@@ -0,0 +1,49 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. ioasid:
+
+=
+IOASID Userspace API
+=
+
+The IOASID UAPI is used for userspace IOASID allocation/free requests;
+IOASID management is thus centralized in the IOASID core [1] in the kernel.
+The primary use case is guest Shared Virtual Address (SVA) today.
+
+Requests such as allocation/free can be issued by the users and managed
+on a per-process basis through the ioasid core. Upon opening "/dev/ioasid",
+a process obtains a unique handle associated with the process's mm_struct.
+This handle is mapped to an FD in userspace. Only a single open is
+allowed per process.
+
+File descriptors can be transferred across processes by employing fork() or
+UNIX domain socket. FDs obtained by transfer cannot be used to perform
+IOASID requests. The following behaviors are recommended for the
+applications:
+
+ - forked children close the parent's IOASID FDs immediately, open new
+   

[RFC PATCH 16/18] iommu/ioasid: Consult IOASIDs cgroup for allocation

2021-02-27 Thread Jacob Pan
Once the IOASIDs cgroup is active, we must consult the limits set up by
the cgroups during allocation. Freeing an IOASID also needs to return the
quota back to the cgroup.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index d42b39ca2c8b..fd3f5729c71d 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -782,7 +782,10 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t 
min, ioasid_t max,
 
	spin_lock(&ioasid_allocator_lock);
/* Check if the IOASID set has been allocated and initialized */
-   if (!ioasid_set_is_valid(set))
+   if (!set || !ioasid_set_is_valid(set))
+   goto done_unlock;
+
+   if (set->type == IOASID_SET_TYPE_MM && ioasid_cg_charge(set))
goto done_unlock;
 
	if (set->quota <= atomic_read(&set->nr_ioasids)) {
@@ -832,6 +835,7 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, 
ioasid_t max,
goto done_unlock;
 exit_free:
kfree(data);
+   ioasid_cg_uncharge(set);
 done_unlock:
	spin_unlock(&ioasid_allocator_lock);
return id;
@@ -849,6 +853,7 @@ static void ioasid_do_free_locked(struct ioasid_data *data)
kfree_rcu(ioasid_data, rcu);
}
	atomic_dec(&data->set->nr_ioasids);
+   ioasid_cg_uncharge(data->set);
	xa_erase(&data->set->xa, data->id);
/* Destroy the set if empty */
	if (!atomic_read(&data->set->nr_ioasids))
-- 
2.25.1



[RFC PATCH 15/18] cgroup: Introduce ioasids controller

2021-02-27 Thread Jacob Pan
IOASIDs are used to associate DMA requests with virtual address spaces.
They are a system-wide limited resource made available to userspace
applications, be they VMs or user-space device drivers.

This RFC patch introduces a cgroup controller to address the following
problems:
1. Some user applications exhaust all the available IOASIDs, thus
depriving others on the same host.
2. System admins need to provision VMs based on their needs for IOASIDs,
e.g. the number of VMs with assigned devices that perform DMA requests
with PASID.

This patch is nowhere near completion; it merely provides the basic
functionality for resource distribution and cgroup hierarchy
organizational changes.

Since this is part of a greater effort to enable Shared Virtual Address
(SVA) virtualization, we would like to have a direction check and
collect feedback early. For details, please refer to the documentation:
Documentation/admin-guide/cgroup-v1/ioasids.rst

Signed-off-by: Jacob Pan 
---
 include/linux/cgroup_subsys.h |   4 +
 include/linux/ioasid.h|  17 ++
 init/Kconfig  |   7 +
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/ioasids.c   | 345 ++
 5 files changed, 374 insertions(+)
 create mode 100644 kernel/cgroup/ioasids.c

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index acb77dcff3b4..cda75ecdcdcb 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -57,6 +57,10 @@ SUBSYS(hugetlb)
 SUBSYS(pids)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_IOASIDS)
+SUBSYS(ioasids)
+#endif
+
 #if IS_ENABLED(CONFIG_CGROUP_RDMA)
 SUBSYS(rdma)
 #endif
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index 4547086797df..5ea4710efb02 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -135,8 +135,25 @@ void ioasid_set_for_each_ioasid(struct ioasid_set *sdata,
void *data);
 int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block 
*nb);
 void ioasid_unregister_notifier_mm(struct mm_struct *mm, struct notifier_block 
*nb);
+#ifdef CONFIG_CGROUP_IOASIDS
+int ioasid_cg_charge(struct ioasid_set *set);
+void ioasid_cg_uncharge(struct ioasid_set *set);
+#else
+/* No cgroup control, allocation will proceed until run out total pool */
+static inline int ioasid_cg_charge(struct ioasid_set *set)
+{
+   return 0;
+}
+
+static inline void ioasid_cg_uncharge(struct ioasid_set *set)
+{
+}
+#endif /* CGROUP_IOASIDS */
 bool ioasid_queue_work(struct work_struct *work);
+
 #else /* !CONFIG_IOASID */
+
 static inline void ioasid_install_capacity(ioasid_t total)
 {
 }
diff --git a/init/Kconfig b/init/Kconfig
index b77c60f8b963..9a23683dad98 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1017,6 +1017,13 @@ config CGROUP_PIDS
  since the PIDs limit only affects a process's ability to fork, not to
  attach to a cgroup.
 
+config CGROUP_IOASIDS
+   bool "IOASIDs controller"
+   depends on IOASID
+   help
+ Provides enforcement of IO Address Space ID limits in the scope of a
+ cgroup.
+
 config CGROUP_RDMA
bool "RDMA controller"
help
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 5d7a76bfbbb7..c5ad7c9a2305 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -3,6 +3,7 @@ obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
+obj-$(CONFIG_CGROUP_IOASIDS) += ioasids.o
 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/ioasids.c b/kernel/cgroup/ioasids.c
new file mode 100644
index ..ac43813da6ad
--- /dev/null
+++ b/kernel/cgroup/ioasids.c
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * IO Address Space ID limiting controller for cgroups.
+ *
+ */
+#define pr_fmt(fmt)"ioasids_cg: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define IOASIDS_MAX_STR "max"
+static DEFINE_MUTEX(ioasids_cg_lock);
+
+struct ioasids_cgroup {
+   struct cgroup_subsys_state  css;
+   atomic64_t  counter;
+   atomic64_t  limit;
+   struct cgroup_file  events_file;
+   /* Number of times allocations failed because limit was hit. */
+   atomic64_t  events_limit;
+};
+
+static struct ioasids_cgroup *css_ioasids(struct cgroup_subsys_state *css)
+{
+   return container_of(css, struct ioasids_cgroup, css);
+}
+
+static struct ioasids_cgroup *parent_ioasids(struct ioasids_cgroup *ioasids)
+{
+   return css_ioasids(ioasids->css.parent);
+}
+
+static struct cgroup_subsys_state *
+ioasids_css_alloc(struct cgroup_subsys_state *parent)
+{
+   struct ioasids_cgroup *ioasids

[PATCH V4 12/18] iommu/vt-d: Remove mm reference for guest SVA

2021-02-27 Thread Jacob Pan
Now that the IOASID core keeps track of the IOASID to mm_struct ownership
in the form of an ioasid_set with the IOASID_SET_TYPE_MM token type, there
is no need to keep the same mapping in VT-d driver-specific data. Native
SVM usage is not affected by the change.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/svm.c | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index c469c24d23f5..f75699ddb923 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -363,12 +363,6 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, 
struct device *dev,
ret = -ENOMEM;
goto out;
}
-   /* REVISIT: upper layer/VFIO can track host process that bind
-* the PASID. ioasid_set = mm might be sufficient for vfio to
-* check pasid VMM ownership. We can drop the following line
-* once VFIO and IOASID set check is in place.
-*/
-   svm->mm = get_task_mm(current);
svm->pasid = data->hpasid;
if (data->flags & IOMMU_SVA_GPASID_VAL) {
svm->gpasid = data->gpasid;
@@ -376,7 +370,6 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, 
struct device *dev,
}
ioasid_attach_data(data->hpasid, svm);
	INIT_LIST_HEAD_RCU(&svm->devs);
-   mmput(svm->mm);
}
sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
if (!sdev) {
-- 
2.25.1



[PATCH V4 14/18] iommu/vt-d: Listen to IOASID notifications

2021-02-27 Thread Jacob Pan
On Intel Scalable I/O Virtualization (SIOV) enabled platforms, the IOMMU
driver is one of the users of IOASIDs. In the normal flow, callers perform
IOASID allocation, bind, unbind, and free in order. However, for guest
SVA, an IOASID free could come before unbind, as the guest is untrusted.
This patch registers an IOASID notification handler such that the IOMMU
driver can perform PASID teardown upon receiving an unexpected IOASID free
event.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/iommu.c |   2 +
 drivers/iommu/intel/svm.c   | 109 +++-
 include/linux/intel-iommu.h |   2 +
 3 files changed, 111 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index eb9868061545..d602e89c40d2 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3313,6 +3313,8 @@ static int __init init_dmars(void)
pr_err("Failed to allocate host PASID set %lu\n",
PTR_ERR(host_pasid_set));
intel_iommu_sm = 0;
+   } else {
+   intel_svm_add_pasid_notifier();
}
}
 
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index f75699ddb923..b5bb9b578281 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -96,6 +96,104 @@ static inline bool intel_svm_capable(struct intel_iommu 
*iommu)
return iommu->flags & VTD_FLAG_SVM_CAPABLE;
 }
 
+static inline void intel_svm_drop_pasid(ioasid_t pasid)
+{
+   /*
+* Detaching SPID results in UNBIND notification on the set, we must
+* do this before dropping the IOASID reference, otherwise the
+* notification chain may get destroyed.
+*/
+   ioasid_detach_spid(pasid);
+   ioasid_detach_data(pasid);
+   ioasid_put(NULL, pasid);
+}
+
+static DEFINE_MUTEX(pasid_mutex);
+#define pasid_lock_held() lock_is_held(&pasid_mutex.dep_map)
+
+static void intel_svm_free_async_fn(struct work_struct *work)
+{
+   struct intel_svm *svm = container_of(work, struct intel_svm, work);
+   struct intel_svm_dev *sdev;
+
+   /*
+* Unbind all devices associated with this PASID which is
+* being freed by other users such as VFIO.
+*/
+   mutex_lock(&pasid_mutex);
+   list_for_each_entry_rcu(sdev, &svm->devs, list, pasid_lock_held()) {
+   /* Does not poison forward pointer */
+   list_del_rcu(&sdev->list);
+   spin_lock(&sdev->iommu->lock);
+   intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,
+   svm->pasid, true);
+   intel_svm_drain_prq(sdev->dev, svm->pasid);
+   spin_unlock(&sdev->iommu->lock);
+   kfree_rcu(sdev, rcu);
+   }
+   /*
+* We may not be the last user to drop the reference but since
+* the PASID is in FREE_PENDING state, no one can get new reference.
+* Therefore, we can safely free the private data svm.
+*/
+   intel_svm_drop_pasid(svm->pasid);
+
+   /*
+* Free before unbind can only happen with host PASIDs used for
+* guest SVM. We get here because ioasid_free is called with
+* outstanding references. So we need to drop the reference
+* such that the PASID can be reclaimed. unbind_gpasid() after this
+* will not result in dropping refcount since the private data is
+* already detached.
+*/
+   kfree(svm);
+
+   mutex_unlock(&pasid_mutex);
+}
+
+
+static int pasid_status_change(struct notifier_block *nb,
+   unsigned long code, void *data)
+{
+   struct ioasid_nb_args *args = (struct ioasid_nb_args *)data;
+   struct intel_svm *svm = (struct intel_svm *)args->pdata;
+   int ret = NOTIFY_DONE;
+
+   /*
+* Notification private data is a choice of vendor driver when the
+* IOASID is allocated or attached after allocation. When the data
+* type changes, we must make modifications here accordingly.
+*/
+   if (code == IOASID_NOTIFY_FREE) {
+   /*
+* If PASID UNBIND happens before FREE, private data of the
+* IOASID should be NULL, then we don't need to do anything.
+*/
+   if (!svm)
+   goto done;
+   if (args->id != svm->pasid) {
+   pr_warn("Notify PASID does not match data %d : %d\n",
+   args->id, svm->pasid);
+   goto done;
+   }
+   if (!ioasid_queue_work(&svm->work))
+   pr_warn("Cleanup work already queued\n");
+   return NOTIFY_OK;
+   }
+done:
+   return ret;
+}
+
+static struct notifier_block pasid_nb = {
+   .notifier_call = pasid_status_change,
+};

[PATCH V4 13/18] iommu/ioasid: Add a workqueue for cleanup work

2021-02-27 Thread Jacob Pan
An IOASID can have multiple users, such as the IOMMU driver, KVM, and
device drivers. The atomic IOASID notifier is used to inform users of
IOASID state changes. For example, the IOASID_NOTIFY_UNBIND event is
issued when the IOASID is no longer bound to an address space. This
requires ordered actions among users to tear down their contexts.

Not all work can be handled in the atomic notifier handler. This patch
introduces a shared, ordered workqueue for all IOASID users who wish to
perform work asynchronously upon notification.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 25 +
 include/linux/ioasid.h |  1 +
 2 files changed, 26 insertions(+)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 28a2e9b6594d..d42b39ca2c8b 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -32,6 +32,9 @@ static ioasid_t ioasid_capacity = PCI_PASID_MAX;
 static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX;
 static DEFINE_XARRAY_ALLOC(ioasid_sets);
 
+/* Workqueue for IOASID users to do cleanup upon notification */
+static struct workqueue_struct *ioasid_wq;
+
 struct ioasid_set_nb {
struct list_head        list;
struct notifier_block   *nb;
@@ -1281,6 +1284,12 @@ int ioasid_register_notifier_mm(struct mm_struct *mm, 
struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(ioasid_register_notifier_mm);
 
+bool ioasid_queue_work(struct work_struct *work)
+{
+   return queue_work(ioasid_wq, work);
+}
+EXPORT_SYMBOL_GPL(ioasid_queue_work);
+
 void ioasid_unregister_notifier_mm(struct mm_struct *mm, struct notifier_block 
*nb)
 {
struct ioasid_set_nb *curr;
@@ -1303,7 +1312,23 @@ void ioasid_unregister_notifier_mm(struct mm_struct *mm, 
struct notifier_block *
 }
 EXPORT_SYMBOL_GPL(ioasid_unregister_notifier_mm);
 
+static int __init ioasid_init(void)
+{
+   ioasid_wq = alloc_ordered_workqueue("ioasid_wq", 0);
+   if (!ioasid_wq)
+   return -ENOMEM;
+
+   return 0;
+}
+
+static void __exit ioasid_cleanup(void)
+{
+   destroy_workqueue(ioasid_wq);
+}
+
 MODULE_AUTHOR("Jean-Philippe Brucker ");
 MODULE_AUTHOR("Jacob Pan ");
 MODULE_DESCRIPTION("IO Address Space ID (IOASID) allocator");
 MODULE_LICENSE("GPL");
+module_init(ioasid_init);
+module_exit(ioasid_cleanup);
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index 9624b665f810..4547086797df 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -135,6 +135,7 @@ void ioasid_set_for_each_ioasid(struct ioasid_set *sdata,
void *data);
 int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block 
*nb);
 void ioasid_unregister_notifier_mm(struct mm_struct *mm, struct notifier_block 
*nb);
+bool ioasid_queue_work(struct work_struct *work);
 #else /* !CONFIG_IOASID */
 static inline void ioasid_install_capacity(ioasid_t total)
 {
-- 
2.25.1



[PATCH V4 11/18] iommu/ioasid: Add ownership check in guest bind

2021-02-27 Thread Jacob Pan
A bind guest page table call comes with an IOASID provided by userspace.
To prevent attacks by malicious users, we must ensure the IOASID was
allocated under the same process.

This patch adds a new API that performs an ownership check based on
whether the IOASID belongs to the ioasid_set allocated with the
process's mm_struct pointer as its token, as sketched below.
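
To illustrate the intended calling pattern (a sketch mirroring the
iommu.c hunk below; do_vendor_bind() is a hypothetical stand-in for the
vendor bind callback):

    /* Fails with -ENOENT/-EINVAL/-EPERM unless current owns @hpasid */
    ret = ioasid_get_if_owned(hpasid);
    if (ret)
        return ret;

    ret = do_vendor_bind(domain, dev, hpasid);

    /* Drop the reference taken by the ownership check */
    ioasid_put(NULL, hpasid);
    return ret;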

Signed-off-by: Liu Yi L 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 37 +
 drivers/iommu/iommu.c  | 16 ++--
 include/linux/ioasid.h |  6 ++
 3 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 96e941dfada7..28a2e9b6594d 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * An IOASID can have multiple consumers where each consumer may have
@@ -1028,6 +1029,42 @@ int ioasid_get(struct ioasid_set *set, ioasid_t ioasid)
 }
 EXPORT_SYMBOL_GPL(ioasid_get);
 
+/**
+ * ioasid_get_if_owned - obtain a reference to the IOASID if the IOASID belongs
+ * to the ioasid_set with the current mm as token
+ * @ioasid: the IOASID to take a reference on
+ *
+ * Return: 0 on success, a negative errno on failure.
+ */
+int ioasid_get_if_owned(ioasid_t ioasid)
+{
+   struct ioasid_set *set;
+   int ret;
+
+   spin_lock(&ioasid_allocator_lock);
+   set = ioasid_find_set(ioasid);
+   if (IS_ERR_OR_NULL(set)) {
+   ret = -ENOENT;
+   goto done_unlock;
+   }
+   if (set->type != IOASID_SET_TYPE_MM) {
+   ret = -EINVAL;
+   goto done_unlock;
+   }
+   if (current->mm != set->token) {
+   ret = -EPERM;
+   goto done_unlock;
+   }
+
+   ret = ioasid_get_locked(set, ioasid);
+done_unlock:
+   spin_unlock(&ioasid_allocator_lock);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_get_if_owned);
+
 bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid)
 {
struct ioasid_data *data;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index fd76e2f579fe..18716d856b02 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2169,7 +2169,13 @@ int iommu_uapi_sva_bind_gpasid(struct iommu_domain 
*domain, struct device *dev,
if (ret)
return ret;
 
-   return domain->ops->sva_bind_gpasid(domain, dev, &data);
+   ret = ioasid_get_if_owned(data.hpasid);
+   if (ret)
+   return ret;
+   ret = domain->ops->sva_bind_gpasid(domain, dev, &data);
+   ioasid_put(NULL, data.hpasid);
+
+   return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_uapi_sva_bind_gpasid);
 
@@ -2196,7 +2202,13 @@ int iommu_uapi_sva_unbind_gpasid(struct iommu_domain 
*domain, struct device *dev
if (ret)
return ret;
 
-   return iommu_sva_unbind_gpasid(domain, dev, data.hpasid);
+   ret = ioasid_get_if_owned(data.hpasid);
+   if (ret)
+   return ret;
+   ret = iommu_sva_unbind_gpasid(domain, dev, data.hpasid);
+   ioasid_put(NULL, data.hpasid);
+
+   return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_uapi_sva_unbind_gpasid);
 
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index c97e80ff65cc..9624b665f810 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -111,6 +111,7 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, 
ioasid_t max,
  void *private);
 int ioasid_get(struct ioasid_set *set, ioasid_t ioasid);
 int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid);
+int ioasid_get_if_owned(ioasid_t ioasid);
 bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid);
 bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid);
 void ioasid_free(struct ioasid_set *set, ioasid_t ioasid);
@@ -180,6 +181,11 @@ static inline int ioasid_get_locked(struct ioasid_set 
*set, ioasid_t ioasid)
return -ENOTSUPP;
 }
 
+static inline int ioasid_get_if_owned(ioasid_t ioasid)
+{
+   return -ENOTSUPP;
+}
+
 static inline bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid)
 {
return false;
-- 
2.25.1



[PATCH V4 10/18] iommu/ioasid: Support mm token type ioasid_set notifications

2021-02-27 Thread Jacob Pan
As a system-wide resource, IOASID is often shared by multiple kernel
subsystems that are independent of each other. However, at the
ioasid_set level, these kernel subsystems must communicate with each
other for ownership checking, event notifications, etc. For example, on
Intel Scalable IO Virtualization (SIOV) enabled platforms, KVM and VFIO
instances under the same process/guest must be aware of a shared IOASID
set.
The IOASID_SET_TYPE_MM token type was introduced to explicitly mark an
IOASID set that belongs to a process, using its mm_struct pointer as
the token. Users within the same process can then identify one another
based on this token.

This patch introduces MM token specific event registration APIs. Event
subscribers such as KVM instances can register an IOASID event handler
without knowledge of its ioasid_set. Event handlers are registered
against the mm_struct pointer as a token. In case subscribers register
a handler *prior* to the creation of the ioasid_set, the handler's
notification block is stored in a pending list within the IOASID core.
Once the ioasid_set of the MM token is created, the notification block
is registered by the IOASID core, as sketched below.
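
A sketch of the subscriber side (illustrative only; the notifier handler
my_pasid_notifier() is assumed to be defined elsewhere):

static struct notifier_block my_nb = {
    .notifier_call = my_pasid_notifier,
};

int my_subscribe(void)
{
    /*
     * Safe to call before the MM-token ioasid_set exists: the block
     * parks on the pending list and the IOASID core registers it
     * once the set is created.
     */
    return ioasid_register_notifier_mm(current->mm, &my_nb);
}

void my_unsubscribe(void)
{
    ioasid_unregister_notifier_mm(current->mm, &my_nb);
}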

Signed-off-by: Liu Yi L 
Signed-off-by: Wu Hao 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 142 +
 include/linux/ioasid.h |  18 ++
 2 files changed, 160 insertions(+)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 56577e745c4b..96e941dfada7 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -21,6 +21,8 @@
  * keep local states in sync.
  */
 static ATOMIC_NOTIFIER_HEAD(ioasid_notifier);
+/* List to hold pending notification block registrations */
+static LIST_HEAD(ioasid_nb_pending_list);
 static DEFINE_SPINLOCK(ioasid_nb_lock);
 
 /* Default to PCIe standard 20 bit PASID */
@@ -574,6 +576,27 @@ static inline bool ioasid_set_is_valid(struct ioasid_set 
*set)
return xa_load(&ioasid_sets, set->id) == set;
 }
 
+static void ioasid_add_pending_nb(struct ioasid_set *set)
+{
+   struct ioasid_set_nb *curr;
+
+   if (set->type != IOASID_SET_TYPE_MM)
+   return;
+   /*
+* Check if there are any pending nb requests for the given token, if so
+* add them to the notifier chain.
+*/
+   spin_lock(&ioasid_nb_lock);
+   list_for_each_entry(curr, &ioasid_nb_pending_list, list) {
+   if (curr->token == set->token && !curr->active) {
+   atomic_notifier_chain_register(&set->nh, curr->nb);
+   curr->set = set;
+   curr->active = true;
+   }
+   }
+   spin_unlock(&ioasid_nb_lock);
+}
+
 /**
  * ioasid_set_alloc - Allocate a new IOASID set for a given token
  *
@@ -658,6 +681,11 @@ struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t 
quota, int type)
atomic_set(&set->nr_ioasids, 0);
ATOMIC_INIT_NOTIFIER_HEAD(&set->nh);
 
+   /*
+* Check if there are any pending nb requests for the given token, if so
+* add them to the notifier chain.
+*/
+   ioasid_add_pending_nb(set);
/*
 * Per set XA is used to store private IDs within the set, get ready
 * for ioasid_set private ID and system-wide IOASID allocation
@@ -675,6 +703,7 @@ EXPORT_SYMBOL_GPL(ioasid_set_alloc);
 
 static int ioasid_set_free_locked(struct ioasid_set *set)
 {
+   struct ioasid_set_nb *curr;
int ret = 0;
 
if (!ioasid_set_is_valid(set)) {
@@ -688,6 +717,16 @@ static int ioasid_set_free_locked(struct ioasid_set *set)
}
 
WARN_ON(!xa_empty(&set->xa));
+   /* Restore pending status of the set NBs */
+   list_for_each_entry(curr, &ioasid_nb_pending_list, list) {
+   if (curr->token == set->token) {
+   if (curr->active)
+   curr->active = false;
+   else
+   pr_warn("Set token exists but not active!\n");
+   }
+   }
+
/*
 * Token got released right away after the ioasid_set is freed.
 * If a new set is created immediately with the newly released token,
@@ -1117,6 +1156,22 @@ EXPORT_SYMBOL_GPL(ioasid_register_notifier);
 void ioasid_unregister_notifier(struct ioasid_set *set,
struct notifier_block *nb)
 {
+   struct ioasid_set_nb *curr;
+
+   spin_lock(&ioasid_nb_lock);
+   /*
+* Pending list is registered with a token without an ioasid_set,
+* therefore should not be unregistered directly.
+*/
+   list_for_each_entry(curr, &ioasid_nb_pending_list, list) {
+   if (curr->nb == nb) {
+   pr_warn("Cannot unregister NB from pending list\n");
+   spin_unlock(&ioasid_nb_lock);
+   return;
+   }
+   }
+   spin_unlock(&ioasid_nb_lock);
+
i

[PATCH V4 09/18] iommu/ioasid: Introduce notification APIs

2021-02-27 Thread Jacob Pan
Relations among IOASID users largely follow a publisher-subscriber
pattern. E.g. to support guest SVA on Intel Scalable I/O Virtualization
(SIOV) enabled platforms, VFIO, IOMMU, device drivers, KVM are all users
of IOASIDs. When a state change occurs, VFIO publishes the change event
that needs to be processed by other users/subscribers.

This patch introduces two types of notifications: global and per
ioasid_set. The latter is intended for users who only need to handle
events related to the IOASIDs of a given set.
For more information, refer to the kernel documentation at
Documentation/ioasid.rst.
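
To illustrate the subscriber pattern (a sketch, not part of the patch),
a per-set user might handle events as follows; the local bookkeeping is
hypothetical:

static int my_set_notifier(struct notifier_block *nb,
                           unsigned long code, void *data)
{
    struct ioasid_nb_args *args = data;

    switch (code) {
    case IOASID_NOTIFY_BIND:
        /* e.g. record the new SPID mapping for args->id */
        return NOTIFY_OK;
    case IOASID_NOTIFY_UNBIND:
        /* e.g. tear down local state keyed by args->id */
        return NOTIFY_OK;
    default:
        return NOTIFY_DONE;
    }
}

static struct notifier_block my_set_nb = {
    .notifier_call = my_set_notifier,
};

/* Receive events for IOASIDs of @set only */
ioasid_register_notifier(set, &my_set_nb);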

Signed-off-by: Liu Yi L 
Signed-off-by: Wu Hao 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 111 +++--
 include/linux/ioasid.h |  54 
 2 files changed, 161 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 7707bb608bdd..56577e745c4b 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -10,12 +10,33 @@
 #include 
 #include 
 
+/*
+ * An IOASID can have multiple consumers where each consumer may have
+ * hardware contexts associated with the IOASID.
+ * When a status change occurs, like on IOASID deallocation, notifier chains
+ * are used to keep the consumers in sync.
+ * This is a publisher-subscriber pattern where publisher can change the
+ * state of each IOASID, e.g. alloc/free, bind IOASID to a device and mm.
+ * On the other hand, subscribers get notified for the state change and
+ * keep local states in sync.
+ */
+static ATOMIC_NOTIFIER_HEAD(ioasid_notifier);
+static DEFINE_SPINLOCK(ioasid_nb_lock);
+
 /* Default to PCIe standard 20 bit PASID */
 #define PCI_PASID_MAX 0x10
 static ioasid_t ioasid_capacity = PCI_PASID_MAX;
 static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX;
 static DEFINE_XARRAY_ALLOC(ioasid_sets);
 
+struct ioasid_set_nb {
+   struct list_head        list;
+   struct notifier_block   *nb;
+   void                    *token;
+   struct ioasid_set       *set;
+   bool                    active;
+};
+
 enum ioasid_state {
IOASID_STATE_IDLE,
IOASID_STATE_ACTIVE,
@@ -415,6 +436,38 @@ void ioasid_detach_data(ioasid_t ioasid)
 }
 EXPORT_SYMBOL_GPL(ioasid_detach_data);
 
+/**
+ * ioasid_notify - Send notification on a given IOASID for status change.
+ *
+ * @data:  The IOASID data for which the notification will be sent
+ * @cmd:   Notification event sent by IOASID external users, can be
+ * IOASID_NOTIFY_BIND or IOASID_NOTIFY_UNBIND.
+ * @flags: Special instructions, e.g. notify within a set or globally,
+ * via the IOASID_NOTIFY_FLAG_SET or IOASID_NOTIFY_FLAG_ALL flags.
+ *
+ * Caller must hold ioasid_allocator_lock and a reference to the IOASID.
+ */
+static int ioasid_notify(struct ioasid_data *data,
+enum ioasid_notify_val cmd, unsigned int flags)
+{
+   struct ioasid_nb_args args = { 0 };
+   int ret = 0;
+
+   if (flags & ~(IOASID_NOTIFY_FLAG_ALL | IOASID_NOTIFY_FLAG_SET))
+   return -EINVAL;
+
+   args.id = data->id;
+   args.set = data->set;
+   args.pdata = data->private;
+   args.spid = data->spid;
+   if (flags & IOASID_NOTIFY_FLAG_ALL)
+   ret = atomic_notifier_call_chain(&ioasid_notifier, cmd, &args);
+   if (flags & IOASID_NOTIFY_FLAG_SET)
+   ret = atomic_notifier_call_chain(&data->set->nh, cmd, &args);
+
+   return ret;
+}
+
 static ioasid_t ioasid_find_by_spid_locked(struct ioasid_set *set, ioasid_t 
spid, bool get)
 {
ioasid_t ioasid = INVALID_IOASID;
@@ -468,7 +521,7 @@ int ioasid_attach_spid(ioasid_t ioasid, ioasid_t spid)
goto done_unlock;
}
data->spid = spid;
-
+   ioasid_notify(data, IOASID_NOTIFY_BIND, IOASID_NOTIFY_FLAG_SET);
 done_unlock:
spin_unlock(&ioasid_allocator_lock);
return ret;
@@ -486,8 +539,8 @@ void ioasid_detach_spid(ioasid_t ioasid)
pr_err("Invalid IOASID entry %d to detach\n", ioasid);
goto done_unlock;
}
+   ioasid_notify(data, IOASID_NOTIFY_UNBIND, IOASID_NOTIFY_FLAG_SET);
data->spid = INVALID_IOASID;
-
 done_unlock:
spin_unlock(&ioasid_allocator_lock);
 }
@@ -603,6 +656,8 @@ struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t 
quota, int type)
set->quota = quota;
set->id = id;
atomic_set(&set->nr_ioasids, 0);
+   ATOMIC_INIT_NOTIFIER_HEAD(&set->nh);
+
/*
 * Per set XA is used to store private IDs within the set, get ready
 * for ioasid_set private ID and system-wide IOASID allocation
@@ -655,7 +710,9 @@ int ioasid_set_free(struct ioasid_set *set)
int ret = 0;
 
spin_lock(&ioasid_allocator_lock);
+   spin_lock(&ioasid_nb_lock);
ret = ioasid_set_free_locked(set);
+   spin_unlock(&ioasid_nb_lock);
spin_unlock(&ioasid_allocator_lock);
return ret;
 }
@@ -728,6 +785,7 @@ 

[PATCH V4 06/18] iommu/ioasid: Add free function and states

2021-02-27 Thread Jacob Pan
When an actively used IOASID is freed due to exceptions, users must be
notified to perform the cleanup. The IOASID shall be put in a pending
state until all users completed their cleanup work.

This patch adds an ioasid_free() function to let the caller initiate
the freeing process. Both ioasid_free() and ioasid_put() decrement the
reference count. Unlike ioasid_put(), ioasid_free() also transitions
the IOASID to the free-pending state, where further ioasid_get() calls
are prohibited. This paves the way for the FREE event notifications
that will be introduced next.
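
The intended lifecycle, sketched with the APIs from this series
(illustrative; @set, @min, @max, and @private come from the caller):

    ioasid_t id = ioasid_alloc(set, min, max, private); /* refcount 1 */

    if (id == INVALID_IOASID)
        return -ENOSPC;

    ioasid_get(set, id);    /* a second user takes a reference */

    ioasid_free(set, id);   /* drop one ref, enter FREE_PENDING;
                             * further ioasid_get() now fails */
    ioasid_put(set, id);    /* last reference gone: fully released */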

Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 73 ++
 include/linux/ioasid.h |  5 +++
 2 files changed, 78 insertions(+)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index d7b476651027..a10f8154c680 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -15,8 +15,26 @@
 static ioasid_t ioasid_capacity = PCI_PASID_MAX;
 static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX;
 static DEFINE_XARRAY_ALLOC(ioasid_sets);
+
+enum ioasid_state {
+   IOASID_STATE_IDLE,
+   IOASID_STATE_ACTIVE,
+   IOASID_STATE_FREE_PENDING,
+};
+
+/**
+ * struct ioasid_data - Meta data about ioasid
+ *
+ * @id:    Unique ID
+ * @refs:  Number of active users
+ * @state: Track state of the IOASID
+ * @set:   ioasid_set the IOASID belongs to
+ * @private:   Private data associated with the IOASID
+ * @rcu:   For free after RCU grace period
+ */
 struct ioasid_data {
ioasid_t id;
+   enum ioasid_state state;
struct ioasid_set *set;
void *private;
struct rcu_head rcu;
@@ -597,6 +615,7 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, 
ioasid_t max,
goto exit_free;
}
data->id = id;
+   data->state = IOASID_STATE_IDLE;
 
/* Store IOASID in the per set data */
if (xa_err(xa_store(&set->xa, id, data, GFP_ATOMIC))) {
@@ -631,6 +650,56 @@ static void ioasid_do_free_locked(struct ioasid_data *data)
ioasid_set_free_locked(data->set);
 }
 
+static void ioasid_free_locked(struct ioasid_set *set, ioasid_t ioasid)
+{
+   struct ioasid_data *data;
+
+   data = xa_load(&active_allocator->xa, ioasid);
+   if (!data) {
+   pr_err_ratelimited("Trying to free unknown IOASID %u\n", 
ioasid);
+   return;
+   }
+   if (data->set != set) {
+   pr_warn("Cannot free IOASID %u due to set ownership\n", ioasid);
+   return;
+   }
+   /* Check if the set exists */
+   if (WARN_ON(!xa_load(&ioasid_sets, data->set->id)))
+   return;
+
+   /* Free is already in progress */
+   if (data->state == IOASID_STATE_FREE_PENDING)
+   return;
+
+   data->state = IOASID_STATE_FREE_PENDING;
+   /*
+* If the refcount is 1, there are no other users of the IOASID
+* besides the IOASID core itself, so there is no need to notify
+* anyone.
+*/
+   if (!refcount_dec_and_test(&data->refs))
+   return;
+
+   ioasid_do_free_locked(data);
+}
+
+/**
+ * ioasid_free - Drop reference on an IOASID. Free if refcount drops to 0,
+ *   including free from its set and system-wide list.
+ * @set:   The ioasid_set to check permission with. If not NULL, IOASID
+ * free will fail if the set does not match.
+ * @ioasid:The IOASID to remove
+ *
+ * TODO: return true if all references are dropped, false if async work is
+ * in progress (IOASID in FREE_PENDING state); a wait queue may be used to
+ * block the freeing task.
+ */
+void ioasid_free(struct ioasid_set *set, ioasid_t ioasid)
+{
+   spin_lock(&ioasid_allocator_lock);
+   ioasid_free_locked(set, ioasid);
+   spin_unlock(&ioasid_allocator_lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_free);
 int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid)
 {
struct ioasid_data *data;
@@ -640,6 +709,10 @@ int ioasid_get_locked(struct ioasid_set *set, ioasid_t 
ioasid)
pr_err("Trying to get unknown IOASID %u\n", ioasid);
return -EINVAL;
}
+   if (data->state == IOASID_STATE_FREE_PENDING) {
+   pr_err("Trying to get IOASID %u being freed\n", ioasid);
+   return -EBUSY;
+   }
 
/* Check set ownership if the set is non-null */
if (set && data->set != set) {
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index 095f4e50dc58..cabaf0b0348f 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -72,6 +72,7 @@ int ioasid_get(struct ioasid_set *set, ioasid_t ioasid);
 int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid);
 bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid);
 bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid);
+void ioasid_free(struct ioasid_set *set, ioasid_t ioasid);

[PATCH V4 08/18] iommu/ioasid: Introduce ioasid_set private ID

2021-02-27 Thread Jacob Pan
When an IOASID set is used for guest SVA, each VM will acquire its own
ioasid_set for IOASID allocations. IOASIDs within the VM must have a
host/physical IOASID backing, and the mapping between guest and host
IOASIDs can be non-identical. The IOASID set private ID (SPID) is
introduced in this patch to be used as the guest IOASID. However, the
concept of an ioasid_set-specific namespace is generic, hence the name
SPID.

As the SPID namespace is scoped to the IOASID set, the IOASID core can
provide lookup services in both directions. An SPID may not be
available when its IOASID is allocated; the mapping between SPID and
IOASID is usually established when a guest page table is bound to a
host PASID, as sketched below.
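
A sketch of the intended usage (illustrative; @vm_set, @hpasid, and
@gpasid are the caller's):

    /* At guest bind time: record the guest PASID as the SPID */
    ret = ioasid_attach_spid(hpasid, gpasid);
    if (ret)
        return ret; /* e.g. -EINVAL if @gpasid is already in the set */

    /* Later: translate the guest PASID back, taking a reference */
    hpasid = ioasid_find_by_spid(vm_set, gpasid, true);
    if (hpasid == INVALID_IOASID)
        return -ENOENT;

    /* At unbind time */
    ioasid_detach_spid(hpasid);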

Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 104 +
 include/linux/ioasid.h |  18 +++
 2 files changed, 122 insertions(+)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 9a3ba157dec3..7707bb608bdd 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -26,6 +26,7 @@ enum ioasid_state {
  * struct ioasid_data - Meta data about ioasid
  *
  * @id:Unique ID
+ * @spid:  Private ID unique within a set
  * @refs:  Number of active users
  * @state: Track state of the IOASID
  * @set:   ioasid_set of the IOASID belongs to
@@ -34,6 +35,7 @@ enum ioasid_state {
  */
 struct ioasid_data {
ioasid_t id;
+   ioasid_t spid;
enum ioasid_state state;
struct ioasid_set *set;
void *private;
@@ -413,6 +415,107 @@ void ioasid_detach_data(ioasid_t ioasid)
 }
 EXPORT_SYMBOL_GPL(ioasid_detach_data);
 
+static ioasid_t ioasid_find_by_spid_locked(struct ioasid_set *set, ioasid_t 
spid, bool get)
+{
+   ioasid_t ioasid = INVALID_IOASID;
+   struct ioasid_data *entry;
+   unsigned long index;
+
+   if (!xa_load(&ioasid_sets, set->id)) {
+   pr_warn("Invalid set\n");
+   goto done;
+   }
+
+   xa_for_each(&set->xa, index, entry) {
+   if (spid == entry->spid) {
+   if (get)
+   refcount_inc(&entry->refs);
+   ioasid = index;
+   }
+   }
+done:
+   return ioasid;
+}
+
+/**
+ * ioasid_attach_spid - Attach ioasid_set private ID to an IOASID
+ *
+ * @ioasid: the system-wide IOASID to attach
+ * @spid:   the ioasid_set private ID of @ioasid
+ *
+ * After attaching an SPID, future lookups can be done via ioasid_find_by_spid().
+ */
+int ioasid_attach_spid(ioasid_t ioasid, ioasid_t spid)
+{
+   struct ioasid_data *data;
+   int ret = 0;
+
+   if (spid == INVALID_IOASID)
+   return -EINVAL;
+
+   spin_lock(&ioasid_allocator_lock);
+   data = xa_load(&active_allocator->xa, ioasid);
+
+   if (!data) {
+   pr_err("No IOASID entry %d to attach SPID %d\n",
+   ioasid, spid);
+   ret = -ENOENT;
+   goto done_unlock;
+   }
+   /* Check if SPID is unique within the set */
+   if (ioasid_find_by_spid_locked(data->set, spid, false) != 
INVALID_IOASID) {
+   ret = -EINVAL;
+   goto done_unlock;
+   }
+   data->spid = spid;
+
+done_unlock:
+   spin_unlock(&ioasid_allocator_lock);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_attach_spid);
+
+void ioasid_detach_spid(ioasid_t ioasid)
+{
+   struct ioasid_data *data;
+
+   spin_lock(&ioasid_allocator_lock);
+   data = xa_load(&active_allocator->xa, ioasid);
+
+   if (!data || data->spid == INVALID_IOASID) {
+   pr_err("Invalid IOASID entry %d to detach\n", ioasid);
+   goto done_unlock;
+   }
+   data->spid = INVALID_IOASID;
+
+done_unlock:
+   spin_unlock(&ioasid_allocator_lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_detach_spid);
+
+/**
+ * ioasid_find_by_spid - Find the system-wide IOASID by a set private ID and
+ * its set.
+ *
+ * @set:   the ioasid_set to search within
+ * @spid:  the set private ID
+ * @get:   flag indicates whether to take a reference once found
+ *
+ * Given a set private ID and its IOASID set, find the system-wide IOASID. Take
+ * a reference upon finding the matching IOASID if @get is true. Return
+ * INVALID_IOASID if the IOASID is not found in the set or the set is not 
valid.
+ */
+ioasid_t ioasid_find_by_spid(struct ioasid_set *set, ioasid_t spid, bool get)
+{
+   ioasid_t ioasid;
+
+   spin_lock(&ioasid_allocator_lock);
+   ioasid = ioasid_find_by_spid_locked(set, spid, get);
+   spin_unlock(&ioasid_allocator_lock);
+   return ioasid;
+}
+EXPORT_SYMBOL_GPL(ioasid_find_by_spid);
+
 static inline bool ioasid_set_is_valid(struct ioasid_set *set)
 {
return xa_load(&ioasid_sets, set->id) == set;
@@ -616,6 +719,7 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, 
ioasid_t max,
}
data->id = id;
data->state = IOASID_STATE_IDLE;
+   data->spid = INVALID_IOASID;
 
/* Store

[PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-02-27 Thread Jacob Pan
ioasid_set was introduced as an arbitrary token that is shared by a
group of IOASIDs. For example, two IOASIDs allocated via the same
ioasid_set pointer belong to the same set.

For guest SVA usage, system-wide IOASID resources need to be
partitioned such that each VM can have its own quota and be managed
separately. ioasid_set is the perfect candidate for meeting such
requirements. This patch redefines and extends ioasid_set with the
following new fields:
- Quota
- Reference count
- Storage of its namespace
- The token is now stored in the ioasid_set with types

Basic ioasid_set level APIs are introduced to wire up these new data.
Existing users of the IOASID APIs are converted, with a host IOASID set
allocated for bare-metal usage, including the VT-d driver and
iommu-sva-lib.
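
A sketch of the resulting allocation flow (illustrative; the quota and
allocation bounds are made up):

    struct ioasid_set *set;
    ioasid_t id;

    /* A set of up to 16 IOASIDs, keyed by the current process */
    set = ioasid_set_alloc(current->mm, 16, IOASID_SET_TYPE_MM);
    if (IS_ERR_OR_NULL(set))
        return -ENOSPC;

    id = ioasid_alloc(set, 1, 0xfffff, NULL);
    if (id == INVALID_IOASID) {
        ioasid_set_free(set);
        return -ENOSPC;
    }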

Signed-off-by: Liu Yi L 
Signed-off-by: Jacob Pan 
---
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   |   1 +
 drivers/iommu/intel/iommu.c   |  27 +-
 drivers/iommu/intel/pasid.h   |   1 +
 drivers/iommu/intel/svm.c |  25 +-
 drivers/iommu/ioasid.c| 288 +++---
 drivers/iommu/iommu-sva-lib.c |  19 +-
 include/linux/ioasid.h|  68 -
 7 files changed, 361 insertions(+), 68 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index e13b092e6004..588aa66ed5e4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -459,6 +459,7 @@ int arm_smmu_master_enable_sva(struct arm_smmu_master 
*master)
 {
mutex_lock(&sva_lock);
master->sva_enabled = true;
+   iommu_sva_init();
mutex_unlock(&sva_lock);
 
return 0;
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 6f42ff7d171d..eb9868061545 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -103,6 +103,9 @@
  */
 #define INTEL_IOMMU_PGSIZES(~0xFFFUL)
 
+/* PASIDs used by host SVM */
+struct ioasid_set *host_pasid_set;
+
 static inline int agaw_to_level(int agaw)
 {
return agaw + 2;
@@ -173,6 +176,7 @@ static struct intel_iommu **g_iommus;
 
 static void __init check_tylersburg_isoch(void);
 static int rwbf_quirk;
+static bool scalable_mode_support(void);
 
 /*
  * set to 1 to panic kernel if can't successfully enable VT-d
@@ -3114,8 +3118,8 @@ static void intel_vcmd_ioasid_free(ioasid_t ioasid, void 
*data)
 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
 * We can only free the PASID when all the devices are unbound.
 */
-   if (ioasid_find(NULL, ioasid, NULL)) {
-   pr_alert("Cannot free active IOASID %d\n", ioasid);
+   if (IS_ERR(ioasid_find(host_pasid_set, ioasid, NULL))) {
+   pr_err("IOASID %d to be freed but not in system set\n", ioasid);
return;
}
vcmd_free_pasid(iommu, ioasid);
@@ -3300,8 +3304,17 @@ static int __init init_dmars(void)
goto free_iommu;
 
/* PASID is needed for scalable mode irrespective to SVM */
-   if (intel_iommu_sm)
+   if (scalable_mode_support()) {
ioasid_install_capacity(intel_pasid_max_id);
+   /* We should not run out of IOASIDs at boot */
+   host_pasid_set = ioasid_set_alloc(NULL, PID_MAX_DEFAULT,
+ IOASID_SET_TYPE_NULL);
+   if (IS_ERR_OR_NULL(host_pasid_set)) {
+   pr_err("Failed to allocate host PASID set %lu\n",
+   PTR_ERR(host_pasid_set));
+   intel_iommu_sm = 0;
+   }
+   }
 
/*
 * for each drhd
@@ -3348,7 +3361,7 @@ static int __init init_dmars(void)
disable_dmar_iommu(iommu);
free_dmar_iommu(iommu);
}
-
+   ioasid_set_free(host_pasid_set);
kfree(g_iommus);
 
 error:
@@ -4573,7 +4586,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain,
u32 pasid;
 
/* No private data needed for the default pasid */
-   pasid = ioasid_alloc(NULL, PASID_MIN,
+   pasid = ioasid_alloc(host_pasid_set, PASID_MIN,
 pci_max_pasids(to_pci_dev(dev)) - 1,
 NULL);
if (pasid == INVALID_IOASID) {
@@ -4630,7 +4643,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain,
 link_failed:
spin_unlock_irqrestore(&device_domain_lock, flags);
if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
-   ioasid_put(domain->default_pasid);
+   ioasid_put(host_pasid_set, domain->default_pasid);
 
return ret;
 }
@@ -4660,7 +4673,7 @@ static void aux_domain_remove_dev(struct dmar_domain 
*do

[PATCH V4 07/18] iommu/ioasid: Add ioasid_set iterator helper functions

2021-02-27 Thread Jacob Pan
Users of an ioasid_set may not keep track of all the IOASIDs allocated
under the set. When a collective action is needed for each IOASID, it
is useful to iterate over all the IOASIDs within the set. For example,
when the ioasid_set is freed, the user might perform the same cleanup
operation on each IOASID.

This patch adds an API to iterate over all the IOASIDs within the set,
as sketched below.
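
For illustration (not part of the patch), a teardown path might log
leftover IOASIDs like this; the callback is hypothetical:

static void my_report_one(ioasid_t id, void *data)
{
    pr_info("%s: IOASID %u still allocated\n", (char *)data, id);
}

/* Visit every IOASID remaining in @set */
ioasid_set_for_each_ioasid(set, my_report_one, "teardown");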

Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 84 ++
 include/linux/ioasid.h | 20 ++
 2 files changed, 104 insertions(+)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index a10f8154c680..9a3ba157dec3 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -700,6 +700,61 @@ void ioasid_free(struct ioasid_set *set, ioasid_t ioasid)
spin_unlock(&ioasid_allocator_lock);
 }
 EXPORT_SYMBOL_GPL(ioasid_free);
+
+/**
+ * ioasid_free_all_in_set - Free all PASIDs within a set
+ * @set:   the ioasid_set whose IOASIDs will all be freed
+ *
+ * Free all PASIDs from the system-wide IOASID pool; all subscribers get
+ * notified and perform their own cleanup.
+ * Note that some references to the IOASIDs within the set can still be
+ * held after the free call. This is fine in that the IOASIDs will be
+ * marked inactive, and the only operation that can be done on them is
+ * ioasid_put(). There is no need to track IOASID set states since there
+ * is no reclaim phase.
+ */
+void ioasid_free_all_in_set(struct ioasid_set *set)
+{
+   struct ioasid_data *entry;
+   unsigned long index;
+
+   if (!ioasid_set_is_valid(set))
+   return;
+
+   if (xa_empty(&set->xa))
+   return;
+
+   if (!atomic_read(&set->nr_ioasids))
+   return;
+   spin_lock(&ioasid_allocator_lock);
+   xa_for_each(&set->xa, index, entry) {
+   ioasid_free_locked(set, index);
+   /* Free from per set private pool */
+   xa_erase(&set->xa, index);
+   }
+   spin_unlock(&ioasid_allocator_lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_free_all_in_set);
+
+/**
+ * ioasid_set_for_each_ioasid - Iterate over all the IOASIDs within the set
+ * @set:   the ioasid_set to iterate over
+ * @fn:    the function called for each IOASID
+ * @data:  caller-supplied data passed to @fn
+ */
+void ioasid_set_for_each_ioasid(struct ioasid_set *set,
+   void (*fn)(ioasid_t id, void *data),
+   void *data)
+{
+   struct ioasid_data *entry;
+   unsigned long index;
+
+   xa_for_each(&set->xa, index, entry)
+   fn(index, data);
+}
+EXPORT_SYMBOL_GPL(ioasid_set_for_each_ioasid);
+
 int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid)
 {
struct ioasid_data *data;
@@ -789,6 +844,35 @@ bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid)
 }
 EXPORT_SYMBOL_GPL(ioasid_put);
 
+/**
+ * ioasid_find_set - Find the ioasid_set of an IOASID
+ * @ioasid:	the IOASID to look up
+ *
+ * As long as the IOASID is valid, its set must be valid, since the set's
+ * refcount is based on the number of IOASIDs within it.
+ *
+ * Return: the ioasid_set on success, or an ERR_PTR on failure.
+ */
+struct ioasid_set *ioasid_find_set(ioasid_t ioasid)
+{
+   struct ioasid_allocator_data *idata;
+   struct ioasid_data *ioasid_data;
+   struct ioasid_set *set = NULL;
+
+   rcu_read_lock();
+   idata = rcu_dereference(active_allocator);
+   ioasid_data = xa_load(&idata->xa, ioasid);
+   if (!ioasid_data) {
+   set = ERR_PTR(-ENOENT);
+   goto unlock;
+   }
+   set = ioasid_data->set;
+unlock:
+   rcu_read_unlock();
+   return set;
+}
+EXPORT_SYMBOL_GPL(ioasid_find_set);
+
 /**
  * ioasid_find - Find IOASID data
  * @set: the IOASID set
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index cabaf0b0348f..e7f3e6108724 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -73,12 +73,17 @@ int ioasid_get_locked(struct ioasid_set *set, ioasid_t 
ioasid);
 bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid);
 bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid);
 void ioasid_free(struct ioasid_set *set, ioasid_t ioasid);
+void ioasid_free_all_in_set(struct ioasid_set *set);
 void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
  bool (*getter)(void *));
+struct ioasid_set *ioasid_find_set(ioasid_t ioasid);
 int ioasid_register_allocator(struct ioasid_allocator_ops *allocator);
 void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator);
 int ioasid_attach_data(ioasid_t ioasid, void *data);
 void ioasid_detach_data(ioasid_t ioasid);
+void ioasid_set_for_each_ioasid(struct ioasid_set *sdata,
+   void (*fn)(ioasid_t id, void *data),
+   void *data);
 #else /* !CONFIG_IOASID */
 static inline void ioasid_install_capacity(ioasid_t total)
 {
@@ -158,5 +163,20 @@ static inline int ioasid_attach_data(ioasid_t ioasid, void 
*data)
 static inline void ioasid_detach_data(ioasid_t ioasid)
 {
 }
+
+static inline void ioasid_free_all_in_set(struct ioasid_set *set)
+{
+}
+
+static inline struct ioasid_set *ioasid_find_set(ioasid_t ioasid)

[PATCH V4 02/18] iommu/ioasid: Rename ioasid_set_data()

2021-02-27 Thread Jacob Pan
Rename ioasid_set_data() to ioasid_attach_data() to avoid confusion with
struct ioasid_set. ioasid_set is a group of IOASIDs that share a common
token.

Reviewed-by: Jean-Philippe Brucker 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/svm.c | 6 +++---
 drivers/iommu/ioasid.c| 6 +++---
 include/linux/ioasid.h| 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 18a9f05df407..0053df9edffc 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -371,7 +371,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, 
struct device *dev,
svm->gpasid = data->gpasid;
svm->flags |= SVM_FLAG_GUEST_PASID;
}
-   ioasid_set_data(data->hpasid, svm);
+   ioasid_attach_data(data->hpasid, svm);
INIT_LIST_HEAD_RCU(&svm->devs);
mmput(svm->mm);
}
@@ -425,7 +425,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, 
struct device *dev,
list_add_rcu(&sdev->list, &svm->devs);
  out:
if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
-   ioasid_set_data(data->hpasid, NULL);
+   ioasid_attach_data(data->hpasid, NULL);
kfree(svm);
}
 
@@ -468,7 +468,7 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
 * the unbind, IOMMU driver will get notified
 * and perform cleanup.
 */
-   ioasid_set_data(pasid, NULL);
+   ioasid_attach_data(pasid, NULL);
kfree(svm);
}
}
diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 50ee27bbd04e..eeadf4586e0a 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -259,14 +259,14 @@ void ioasid_unregister_allocator(struct 
ioasid_allocator_ops *ops)
 EXPORT_SYMBOL_GPL(ioasid_unregister_allocator);
 
 /**
- * ioasid_set_data - Set private data for an allocated ioasid
+ * ioasid_attach_data - Set private data for an allocated ioasid
  * @ioasid: the ID to set data
  * @data:   the private data
  *
  * For IOASID that is already allocated, private data can be set
  * via this API. Future lookup can be done via ioasid_find.
  */
-int ioasid_set_data(ioasid_t ioasid, void *data)
+int ioasid_attach_data(ioasid_t ioasid, void *data)
 {
struct ioasid_data *ioasid_data;
int ret = 0;
@@ -288,7 +288,7 @@ int ioasid_set_data(ioasid_t ioasid, void *data)
 
return ret;
 }
-EXPORT_SYMBOL_GPL(ioasid_set_data);
+EXPORT_SYMBOL_GPL(ioasid_attach_data);
 
 /**
  * ioasid_alloc - Allocate an IOASID
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index e9dacd4b9f6b..60ea279802b8 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -40,7 +40,7 @@ void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
  bool (*getter)(void *));
 int ioasid_register_allocator(struct ioasid_allocator_ops *allocator);
 void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator);
-int ioasid_set_data(ioasid_t ioasid, void *data);
+int ioasid_attach_data(ioasid_t ioasid, void *data);
 
 #else /* !CONFIG_IOASID */
 static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min,
@@ -73,7 +73,7 @@ static inline void ioasid_unregister_allocator(struct 
ioasid_allocator_ops *allo
 {
 }
 
-static inline int ioasid_set_data(ioasid_t ioasid, void *data)
+static inline int ioasid_attach_data(ioasid_t ioasid, void *data)
 {
return -ENOTSUPP;
 }
-- 
2.25.1



[PATCH V4 01/18] docs: Document IO Address Space ID (IOASID) APIs

2021-02-27 Thread Jacob Pan
IOASID is used to identify address spaces that can be targeted by device
DMA. It is a system-wide resource that is essential to its many users.
This document is an attempt to help developers from all vendors navigate
the APIs. At this time, ARM SMMU and Intel’s Scalable IO Virtualization
(SIOV) enabled platforms are the primary users of IOASID. Examples of
how SIOV components interact with the IOASID APIs are provided.

Cc: Jonathan Corbet 
Cc: linux-...@vger.kernel.org
Cc: Randy Dunlap 
Signed-off-by: Liu Yi L 
Signed-off-by: Wu Hao 
Signed-off-by: Jacob Pan 
---
 Documentation/driver-api/index.rst  |   1 +
 Documentation/driver-api/ioasid.rst | 510 
 2 files changed, 511 insertions(+)
 create mode 100644 Documentation/driver-api/ioasid.rst

diff --git a/Documentation/driver-api/index.rst 
b/Documentation/driver-api/index.rst
index 2456d0a97ed8..baeec308cf2c 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -37,6 +37,7 @@ available subsections can be seen below.
pci/index
spi
i2c
+   ioasid
ipmb
ipmi
i3c/index
diff --git a/Documentation/driver-api/ioasid.rst 
b/Documentation/driver-api/ioasid.rst
new file mode 100644
index ..f3ed5bf43fa6
--- /dev/null
+++ b/Documentation/driver-api/ioasid.rst
@@ -0,0 +1,510 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. _ioasid:
+
+=====================
+ IO Address Space ID
+=====================
+
+IOASIDs are used to identify virtual address spaces that DMA requests can
+target. It is a generic name for PCIe Process Address ID (PASID) or
+SubstreamID defined by ARM's SMMU.
+
+The primary use cases for IOASIDs are Shared Virtual Address (SVA) and
+IO Virtual Address (IOVA) when multiple address spaces per device are
+desired. Due to hardware architectural differences the requirements for
+IOASID management can vary in terms of namespace, state management, and
+virtualization usages.
+
+The IOASID subsystem consists of three components:
+
+- IOASID core: provides APIs for allocation, pool management,
+  notifications and refcounting.
+- IOASID user:  provides user allocation interface via /dev/ioasid
+- IOASID cgroup controller: manage resource distribution.
+  (Documentation/admin-guide/cgroup-v1/ioasids.rst)
+
+This document covers the features supported by the IOASID core APIs.
+Vendor-specific use cases are also illustrated with Intel's VT-d
+based platforms as the first example. The terms PASID and IOASID are used
+interchangeably throughout this document.
+
+.. contents:: :local:
+
+Glossary
+========
+PASID - Process Address Space ID
+
+IOVA - IO Virtual Address
+
+IOASID - IO Address Space ID (generic term for PCIe PASID and
+SubstreamID in SMMU)
+
+SVA/SVM - Shared Virtual Addressing/Memory
+
+gSVA - Guest Shared Virtual Addressing
+
+gIOVA - Guest IO Virtual Addressing
+
+ENQCMD - Instruction to submit work to shared workqueues. Refer
+to "Intel X86 ISA for efficient workqueue submission" [1]
+
+DSA - Intel Data Streaming Accelerator [2]
+
+VDCM - Virtual Device Composition Module [3]
+
+SIOV - Intel Scalable IO Virtualization
+
+DWQ - Dedicated Work Queue
+
+SWQ - Shared Work Queue
+
+1. 
https://software.intel.com/sites/default/files/managed/c5/15/architecture-instruction-set-extensions-programming-reference.pdf
+
+2. https://01.org/blogs/2019/introducing-intel-data-streaming-accelerator
+
+3. 
https://software.intel.com/en-us/download/intel-data-streaming-accelerator-preliminary-architecture-specification
+
+
+Key Concepts
+============
+
+IOASID Set
+----------
+An IOASID set is a group of IOASIDs allocated from the system-wide
+IOASID pool. Refer to section "IOASID Set Level APIs" for more details.
+
+IOASID set is particularly useful for guest SVA where each guest could
+have its own IOASID set for security and efficiency reasons.
+
+Guest IOASID
+------------
+IOASID used by the guest; it identifies a guest IOVA space or a guest
+VA space per guest process.
+
+Host IOASID
+-----------
+IOASID used by the host either for bare metal SVA or as the backing of a
+guest IOASID.
+
+Bind/Unbind
+-----------
+Refers to the process where mappings among an IOASID, page tables, and
+devices are established or demolished. This usually involves setting up
+an entry in the IOMMU's per-device PASID table with a given PGD.
+
+IOASID Set Private ID (SPID)
+----------------------------
+Each IOASID set has a private namespace of SPIDs. An SPID maps to a
+single system-wide IOASID. Conversely, each IOASID may be associated
+with an alias ID, local to the IOASID set, named SPID.
+SPIDs can be used as guest IOASIDs where each guest could do
+IOASID allocation from its own pool/set and map them to host physical
+IOASIDs. SPIDs are particularly useful for supporting live migration,
+where decoupling guest and host physical resources is necessary. Guest
+to host PASID mappings can be torn down and re-established. Storing the
+mapping inside the kernel 

[PATCH V4 00/18] IOASID extensions for guest SVA

2021-02-27 Thread Jacob Pan
I/O Address Space ID (IOASID) core code was introduced in v5.5 as a generic
kernel allocator service for both PCIe Process Address Space ID (PASID) and
ARM SMMU's Substream ID. IOASIDs are used to associate DMA requests with
virtual address spaces, including both host and guest.

In addition to providing basic ID allocation, ioasid_set was defined as
a token that is shared by a group of IOASIDs. This set token can be
used for permission checking, but lacks some features needed to address
the following guest Shared Virtual Address (SVA) requirements:
- Manage IOASIDs by group, group ownership, quota, etc.
- State synchronization among IOASID users (e.g. IOMMU driver, KVM, device
drivers)
- Non-identity guest-host IOASID mapping
- Lifecycle management

This patchset introduces the following extensions as solutions to the
problems above.
- Redefine and extend IOASID set such that IOASIDs can be managed by 
groups/pools.
- Add notifications for IOASID state synchronization
- Extend reference counting for life cycle alignment among multiple users
- Support ioasid_set private IDs, which can be used as guest IOASIDs
- Add a new cgroup controller for resource distribution

Please refer to Documentation/admin-guide/cgroup-v1/ioasids.rst and
Documentation/driver-api/ioasid.rst in the enclosed patches for more
details.

Based on discussions on LKML[1], a direction change was made in v4 such that
the user interfaces for IOASID allocation are extracted from VFIO
subsystem. The proposed IOASID subsystem now consists of three components:
1. IOASID core[01-14]: provides APIs for allocation, pool management,
  notifications, and refcounting.
2. IOASID cgroup controller[RFC 15-17]: manage resource distribution[2].
3. IOASID user[RFC 18]:  provides user allocation interface via /dev/ioasid 

This patchset only included VT-d driver as users of some of the new APIs.
VFIO and KVM patches are coming up to fully utilize the APIs introduced here.

[1] 
https://lore.kernel.org/linux-iommu/1599734733-6431-1-git-send-email-yi.l@intel.com/
[2] Note that ioasid quota management code can be removed once the IOASIDs
cgroup is ratified.

You can find this series, VFIO, KVM, and IOASID user at:
https://github.com/jacobpan/linux.git ioasid_v4
(VFIO and KVM patches will be available at this branch when published.)

This work is a result of collaboration with many people:
Liu, Yi L 
Wu Hao 
Ashok Raj 
Kevin Tian 

Thanks,

Jacob

Changelog:

v4
- Introduced IOASIDs cgroup controller
- Introduced /dev/ioasid user API for allocation/free
- Added IOASID states and free function, aligned refcounting on v5.11
  introduced by Jean.
- Support iommu-sva-lib (will converge VT-d code afterward)
- Added a shared ordered workqueue for notification work that requires
  thread context. Streamlined notification framework among multiple IOASID
  users.
- Added ioasid_set helper functions for taking per set operations

V3:
- Use consistent ioasid_set_ prefix for ioasid_set level APIs
- Make SPID and private detach/attach APIs symmetric
- Use the same ioasid_put semantics as Jean-Phillippe IOASID reference patch
- Take away the public ioasid_notify() function, notifications are now emitted
  by IOASID core as a result of certain IOASID APIs
- Partition into finer incremental patches
- Miscellaneous cleanup, locking, exception handling fixes based on v2 reviews

V2:
- Redesigned ioasid_set APIs, removed set ID
- Added set private ID (SPID) for guest PASID usage.
- Add per ioasid_set notification and priority support.
- Back to use spinlocks and atomic notifications.
- Added async work in VT-d driver to perform teardown outside atomic context


Jacob Pan (17):
  docs: Document IO Address Space ID (IOASID) APIs
  iommu/ioasid: Rename ioasid_set_data()
  iommu/ioasid: Add a separate function for detach data
  iommu/ioasid: Support setting system-wide capacity
  iommu/ioasid: Redefine IOASID set and allocation APIs
  iommu/ioasid: Add free function and states
  iommu/ioasid: Add ioasid_set iterator helper functions
  iommu/ioasid: Introduce ioasid_set private ID
  iommu/ioasid: Introduce notification APIs
  iommu/ioasid: Support mm token type ioasid_set notifications
  iommu/ioasid: Add ownership check in guest bind
  iommu/vt-d: Remove mm reference for guest SVA
  iommu/ioasid: Add a workqueue for cleanup work
  iommu/vt-d: Listen to IOASID notifications
  cgroup: Introduce ioasids controller
  iommu/ioasid: Consult IOASIDs cgroup for allocation
  docs: cgroup-v1: Add IOASIDs controller

Liu Yi L (1):
  ioasid: Add /dev/ioasid for userspace

 Documentation/admin-guide/cgroup-v1/index.rst |   1 +
 .../admin-guide/cgroup-v1/ioasids.rst | 107 ++
 Documentation/driver-api/index.rst|   1 +
 Documentation/driver-api/ioasid.rst   | 510 +
 Documentation/userspace-api/index.rst |   1 +
 Documentation/userspace-api/ioasid.rst|  49 +
 drivers/iommu/Kconfig |   5 +
 drivers/iommu/Makefile

[PATCH V4 04/18] iommu/ioasid: Support setting system-wide capacity

2021-02-27 Thread Jacob Pan
IOASID is a system-wide resource whose capacity could vary across
systems. The default capacity is 20 bits, as defined by the PCIe
specification. This patch adds a function to allow adjusting the system
IOASID capacity. For VT-d this is set during boot as part of the Intel
IOMMU initialization. APIs are also added to support runtime capacity
reservation, potentially by cgroups, as sketched below.
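
A sketch of how a reservation-based user, e.g. a cgroup charge path,
might use these APIs (illustrative; the count is arbitrary):

    int ret;

    ret = ioasid_reserve_capacity(64);  /* -ENOSPC if < 64 remain */
    if (ret < 0)
        return ret;

    /* ... allocations proceed against the reservation ... */

    ioasid_cancel_capacity(64);         /* on uncharge, return them */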

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/iommu.c |  5 +++
 drivers/iommu/ioasid.c  | 70 +
 include/linux/ioasid.h  | 18 ++
 3 files changed, 93 insertions(+)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index f665322a0991..6f42ff7d171d 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -3298,6 +3299,10 @@ static int __init init_dmars(void)
if (ret)
goto free_iommu;
 
+   /* PASID is needed for scalable mode irrespective to SVM */
+   if (intel_iommu_sm)
+   ioasid_install_capacity(intel_pasid_max_id);
+
/*
 * for each drhd
 *   enable fault log
diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 4eb9b3dd1b85..28681b99340b 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -10,6 +10,10 @@
 #include 
 #include 
 
+/* Default to PCIe standard 20 bit PASID */
+#define PCI_PASID_MAX 0x10
+static ioasid_t ioasid_capacity = PCI_PASID_MAX;
+static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX;
 struct ioasid_data {
ioasid_t id;
struct ioasid_set *set;
@@ -258,6 +262,72 @@ void ioasid_unregister_allocator(struct 
ioasid_allocator_ops *ops)
 }
 EXPORT_SYMBOL_GPL(ioasid_unregister_allocator);
 
+void ioasid_install_capacity(ioasid_t total)
+{
+   spin_lock(&ioasid_allocator_lock);
+   if (ioasid_capacity && ioasid_capacity != PCI_PASID_MAX) {
+   pr_warn("IOASID capacity is already set.\n");
+   goto done_unlock;
+   }
+   ioasid_capacity = ioasid_capacity_avail = total;
+done_unlock:
+   spin_unlock(&ioasid_allocator_lock);
+}
+EXPORT_SYMBOL_GPL(ioasid_install_capacity);
+
+/**
+ * ioasid_reserve_capacity - Reserve capacity from the system pool
+ * @nr_ioasid: Number of IOASIDs requested to be reserved; 0 means
+ * reserve all remaining IDs.
+ *
+ * Return: the remaining capacity on success, or a negative errno.
+ */
+int ioasid_reserve_capacity(ioasid_t nr_ioasid)
+{
+   int ret = 0;
+
+   spin_lock(&ioasid_allocator_lock);
+   if (nr_ioasid > ioasid_capacity_avail) {
+   ret = -ENOSPC;
+   goto done_unlock;
+   }
+   if (!nr_ioasid)
+   nr_ioasid = ioasid_capacity_avail;
+   ioasid_capacity_avail -= nr_ioasid;
+   ret = nr_ioasid;
+done_unlock:
+   spin_unlock(&ioasid_allocator_lock);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_reserve_capacity);
+
+/**
+ * ioasid_cancel_capacity - Return capacity to the system pool
+ * @nr_ioasid: Number of IOASIDs to be returned
+ *
+ * We trust the caller not to return more than it has reserved; we could
+ * also track reservations if needed.
+ *
+ * Return: the remaining capacity on success, or a negative errno.
+ */
+int ioasid_cancel_capacity(ioasid_t nr_ioasid)
+{
+   int ret = 0;
+
+   spin_lock(&ioasid_allocator_lock);
+   if (nr_ioasid + ioasid_capacity_avail > ioasid_capacity) {
+   ret = -EINVAL;
+   goto done_unlock;
+   }
+   ioasid_capacity_avail += nr_ioasid;
+   ret = ioasid_capacity_avail;
+done_unlock:
+   spin_unlock(&ioasid_allocator_lock);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_cancel_capacity);
+
 /**
  * ioasid_attach_data - Set private data for an allocated ioasid
  * @ioasid: the ID to set data
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index f6e705f832f0..2780bdc84b94 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -32,6 +32,10 @@ struct ioasid_allocator_ops {
 #define DECLARE_IOASID_SET(name) struct ioasid_set name = { 0 }
 
 #if IS_ENABLED(CONFIG_IOASID)
+void ioasid_install_capacity(ioasid_t total);
+int ioasid_reserve_capacity(ioasid_t nr_ioasid);
+int ioasid_cancel_capacity(ioasid_t nr_ioasid);
+
 ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
  void *private);
 void ioasid_get(ioasid_t ioasid);
@@ -43,6 +47,20 @@ void ioasid_unregister_allocator(struct ioasid_allocator_ops 
*allocator);
 int ioasid_attach_data(ioasid_t ioasid, void *data);
 void ioasid_detach_data(ioasid_t ioasid);
 #else /* !CONFIG_IOASID */
+static inline void ioasid_install_capacity(ioasid_t total)
+{
+}
+
+static inline int ioasid_reserve_capacity(ioasid_t nr_ioasid)
+{
+   return -ENOSPC;
+}
+
+static inline int ioasid_cancel_capacity(ioasid_t nr_ioasid)
+{
+   return -EINVAL;
+}
+
 static inline ioasid_t ioasid_alloc(struct ioasi

[PATCH V4 03/18] iommu/ioasid: Add a separate function for detach data

2021-02-27 Thread Jacob Pan
IOASID private data can be cleared by calling ioasid_attach_data() with
a NULL data pointer. A common use case is for a caller to free the data
afterward. ioasid_attach_data() calls synchronize_rcu() before
returning, so that the data can be freed safely with no outstanding
readers. However, since synchronize_rcu() may sleep,
ioasid_attach_data() cannot be used under spinlocks.

This patch adds ioasid_detach_data() as a separate API where
synchronize_rcu() is called only in this case. ioasid_attach_data() can
then be used under spinlocks. In addition, this change makes the API
symmetrical.
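
A sketch of the resulting usage (illustrative; @my_lock, @pasid, and
@ctx are the caller's):

    spin_lock(&my_lock);
    ret = ioasid_attach_data(pasid, ctx);   /* no longer sleeps */
    spin_unlock(&my_lock);

    /* Teardown path, process context only: */
    ioasid_detach_data(pasid);  /* returns after synchronize_rcu() */
    kfree(ctx);                 /* no readers can still see @ctx */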

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/svm.c |  4 +--
 drivers/iommu/ioasid.c| 54 +++
 include/linux/ioasid.h|  5 +++-
 3 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 0053df9edffc..68372a7eb8b5 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -425,7 +425,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, 
struct device *dev,
list_add_rcu(&sdev->list, &svm->devs);
  out:
if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
-   ioasid_attach_data(data->hpasid, NULL);
+   ioasid_detach_data(data->hpasid);
kfree(svm);
}
 
@@ -468,7 +468,7 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
 * the unbind, IOMMU driver will get notified
 * and perform cleanup.
 */
-   ioasid_attach_data(pasid, NULL);
+   ioasid_detach_data(pasid);
kfree(svm);
}
}
diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index eeadf4586e0a..4eb9b3dd1b85 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -273,23 +273,57 @@ int ioasid_attach_data(ioasid_t ioasid, void *data)
 
spin_lock(&ioasid_allocator_lock);
ioasid_data = xa_load(&active_allocator->xa, ioasid);
-   if (ioasid_data)
-   rcu_assign_pointer(ioasid_data->private, data);
-   else
+
+   if (!ioasid_data) {
ret = -ENOENT;
-   spin_unlock(&ioasid_allocator_lock);
+   goto done_unlock;
+   }
 
-   /*
-* Wait for readers to stop accessing the old private data, so the
-* caller can free it.
-*/
-   if (!ret)
-   synchronize_rcu();
+   if (ioasid_data->private) {
+   ret = -EBUSY;
+   goto done_unlock;
+   }
+   rcu_assign_pointer(ioasid_data->private, data);
+
+done_unlock:
+   spin_unlock(&ioasid_allocator_lock);
 
return ret;
 }
 EXPORT_SYMBOL_GPL(ioasid_attach_data);
 
+/**
+ * ioasid_detach_data - Clear the private data of an ioasid
+ *
+ * @ioasid: the IOASID to clear private data from
+ */
+void ioasid_detach_data(ioasid_t ioasid)
+{
+   struct ioasid_data *ioasid_data;
+
+   spin_lock(&ioasid_allocator_lock);
+   ioasid_data = xa_load(&active_allocator->xa, ioasid);
+
+   if (!ioasid_data) {
+   pr_warn("IOASID %u not found to detach data from\n", ioasid);
+   goto done_unlock;
+   }
+
+   if (ioasid_data->private) {
+   rcu_assign_pointer(ioasid_data->private, NULL);
+   goto done_unlock;
+   }
+
+done_unlock:
+   spin_unlock(&ioasid_allocator_lock);
+   /*
+* Wait for readers to stop accessing the old private data,
+* so the caller can free it.
+*/
+   synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(ioasid_detach_data);
+
 /**
  * ioasid_alloc - Allocate an IOASID
  * @set: the IOASID set
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index 60ea279802b8..f6e705f832f0 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -41,7 +41,7 @@ void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
 int ioasid_register_allocator(struct ioasid_allocator_ops *allocator);
 void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator);
 int ioasid_attach_data(ioasid_t ioasid, void *data);
-
+void ioasid_detach_data(ioasid_t ioasid);
 #else /* !CONFIG_IOASID */
 static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min,
ioasid_t max, void *private)
@@ -78,5 +78,8 @@ static inline int ioasid_attach_data(ioasid_t ioasid, void 
*data)
return -ENOTSUPP;
 }
 
+static inline void ioasid_detach_data(ioasid_t ioasid)
+{
+}
 #endif /* CONFIG_IOASID */
 #endif /* __LINUX_IOASID_H */
-- 
2.25.1



Re: [PATCH 2/4] iommu/vt-d: Enable write protect propagation from guest

2021-02-22 Thread Jacob Pan
Hi Kevin,

On Sat, 20 Feb 2021 02:38:02 +, "Tian, Kevin" 
wrote:

> > From: Jacob Pan 
> > Sent: Saturday, February 20, 2021 1:09 AM
> > 
> > Hi Kevin,
> > 
> > On Fri, 19 Feb 2021 06:19:04 +, "Tian, Kevin" 
> > wrote:
> >   
> > > > From: Jacob Pan 
> > > > Sent: Friday, February 19, 2021 5:31 AM
> > > >
> > > > Write protect bit, when set, inhibits supervisor writes to the
> > > > read-only pages. In guest supervisor shared virtual addressing
> > > > (SVA), write-protect should be honored upon guest bind supervisor
> > > > PASID request.
> > > >
> > > > This patch extends the VT-d portion of the IOMMU UAPI to include WP
> > > > bit. WPE bit of the  supervisor PASID entry will be set to match
> > > > CPU CR0.WP bit.
> > > >
> > > > Signed-off-by: Sanjay Kumar 
> > > > Signed-off-by: Jacob Pan 
> > > > ---
> > > >  drivers/iommu/intel/pasid.c | 5 +
> > > >  include/uapi/linux/iommu.h  | 3 ++-
> > > >  2 files changed, 7 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/drivers/iommu/intel/pasid.c
> > > > b/drivers/iommu/intel/pasid.c index 0b7e0e726ade..c7a2ec930af4
> > > > 100644 --- a/drivers/iommu/intel/pasid.c
> > > > +++ b/drivers/iommu/intel/pasid.c
> > > > @@ -763,6 +763,11 @@ intel_pasid_setup_bind_data(struct  
> > intel_iommu  
> > > > *iommu, struct pasid_entry *pte,
> > > > return -EINVAL;
> > > > }
> > > > pasid_set_sre(pte);
> > > > +   /* Enable write protect WP if guest requested */
> > > > +   if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_WPE) {
> > > > +   if (pasid_enable_wpe(pte))
> > > > +   return -EINVAL;  
> > >
> > > We should call pasid_set_wpe directly, as this binding is about guest
> > > page table and suppose the guest has done whatever check required
> > > (e.g. gcr0.wp) before setting this bit. pasid_enable_wpe has an
> > > additional check on host cr0.wp thus is logically incorrect here.
> > >  
> > If the host CPU does not support WP, can guest VCPU still support WP? If
> > so, I agree.
> >   
> 
> If you change 'support' to 'enable', then the answer is yes.

I agree, thanks for explaining. Will change it to pasid_set_wpe.

Thanks,

Jacob
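
For the archive, a minimal sketch of the agreed direction (not the posted
v1 hunk): the guest-bind path propagates WPE exactly as requested, without
re-checking host CR0.WP:

	pasid_set_sre(pte);
	/* Guest has already validated its own CR0.WP; just propagate */
	if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_WPE)
		pasid_set_wpe(pte);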


Re: [PATCH 1/4] iommu/vt-d: Enable write protect for supervisor SVM

2021-02-22 Thread Jacob Pan
Hi Lu,

On Sat, 20 Feb 2021 09:56:26 +0800, Lu Baolu 
wrote:

> Hi Jacob and Sanjay,
> 
> On 2/19/21 5:31 AM, Jacob Pan wrote:
> > Write protect bit, when set, inhibits supervisor writes to the read-only
> > pages. In supervisor shared virtual addressing (SVA), where page tables
> > are shared between CPU and DMA, IOMMU PASID entry WPE bit should match
> > CR0.WP bit in the CPU.
> > This patch sets WPE bit for supervisor PASIDs if CR0.WP is set.  
> 
>  From reading the commit message, the intention of this patch is to match
> PASID entry WPE bit with CPU CR0.WP if 1) SRE is set (supervisor
> pasid); 2) page table is shared between CPU and IOMMU. Do I understand
> it right?
> 
yes. that is my intention.

> But what the real code doing is failing pasid entry setup for first
> level translation if CPU CR0.WP is not set. It's not consistent with
> what described above.
> 
> What I am thinking is that, as long as SRE is set, we should always set
> WPE in intel_pasid_setup_first_level(). For supervisor SVA case, we
> should check CPU CR0.WP in intel_svm_bind_mm() and abort binding if
> CR0.WP is not set.
> 
> Thought?
> 
This code only affects supervisor SVA, since PASID_FLAG_SUPERVISOR_MODE
flag is not set for FL IOVA.

> Best regards,
> baolu
> 
> > 
> > Signed-off-by: Sanjay Kumar 
> > Signed-off-by: Jacob Pan 
> > ---
> >   drivers/iommu/intel/pasid.c | 26 ++
> >   1 file changed, 26 insertions(+)
> > 
> > diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
> > index 0cceaabc3ce6..0b7e0e726ade 100644
> > --- a/drivers/iommu/intel/pasid.c
> > +++ b/drivers/iommu/intel/pasid.c
> > @@ -410,6 +410,15 @@ static inline void pasid_set_sre(struct pasid_entry *pe)
> > pasid_set_bits(&pe->val[2], 1 << 0, 1);
> >   }
> >   
> > +/*
> > + * Setup the WPE(Write Protect Enable) field (Bit 132) of a
> > + * scalable mode PASID entry.
> > + */
> > +static inline void pasid_set_wpe(struct pasid_entry *pe)
> > +{
> > +   pasid_set_bits(&pe->val[2], 1 << 4, 1 << 4);
> > +}
> > +
> >   /*
> >* Setup the P(Present) field (Bit 0) of a scalable mode PASID
> >* entry.
> > @@ -553,6 +562,20 @@ static void pasid_flush_caches(struct intel_iommu *iommu,
> > }
> >   }
> >   
> > +static inline int pasid_enable_wpe(struct pasid_entry *pte)
> > +{
> > +   unsigned long cr0 = read_cr0();
> > +
> > +   /* CR0.WP is normally set but just to be sure */
> > +   if (unlikely(!(cr0 & X86_CR0_WP))) {
> > +   pr_err_ratelimited("No CPU write protect!\n");
> > +   return -EINVAL;
> > +   }
> > +   pasid_set_wpe(pte);
> > +
> > +   return 0;
> > +};
> > +
> >   /*
> >* Set up the scalable mode pasid table entry for first only
> >* translation type.
> > @@ -584,6 +607,9 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
> > return -EINVAL;
> > }
> > pasid_set_sre(pte);
> > +   if (pasid_enable_wpe(pte))
> > +   return -EINVAL;
> > +
> > }
> >   
> > if (flags & PASID_FLAG_FL5LP) {
> >   


Thanks,

Jacob


Re: [PATCH 2/4] iommu/vt-d: Enable write protect propagation from guest

2021-02-19 Thread Jacob Pan
Hi Kevin,

On Fri, 19 Feb 2021 06:19:04 +, "Tian, Kevin" 
wrote:

> > From: Jacob Pan 
> > Sent: Friday, February 19, 2021 5:31 AM
> > 
> > Write protect bit, when set, inhibits supervisor writes to the read-only
> > pages. In guest supervisor shared virtual addressing (SVA),
> > write-protect should be honored upon guest bind supervisor PASID
> > request.
> > 
> > This patch extends the VT-d portion of the IOMMU UAPI to include WP bit.
> > WPE bit of the supervisor PASID entry will be set to match CPU CR0.WP
> > bit.
> > 
> > Signed-off-by: Sanjay Kumar 
> > Signed-off-by: Jacob Pan 
> > ---
> >  drivers/iommu/intel/pasid.c | 5 +
> >  include/uapi/linux/iommu.h  | 3 ++-
> >  2 files changed, 7 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
> > index 0b7e0e726ade..c7a2ec930af4 100644
> > --- a/drivers/iommu/intel/pasid.c
> > +++ b/drivers/iommu/intel/pasid.c
> > @@ -763,6 +763,11 @@ intel_pasid_setup_bind_data(struct intel_iommu *iommu, struct pasid_entry *pte,
> > return -EINVAL;
> > }
> > pasid_set_sre(pte);
> > +   /* Enable write protect WP if guest requested */
> > +   if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_WPE) {
> > +   if (pasid_enable_wpe(pte))
> > +   return -EINVAL;  
> 
> We should call pasid_set_wpe directly, as this binding is about guest
> page table and suppose the guest has done whatever check required
> (e.g. gcr0.wp) before setting this bit. pasid_enable_wpe has an
> additional check on host cr0.wp thus is logically incorrect here.
> 
If the host CPU does not support WP, can guest VCPU still support WP? If
so, I agree.

> Thanks
> Kevin
> 
> > +   }
> > }
> > 
> > if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) {
> > diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> > index 68cb558fe8db..33f3dc7a91de 100644
> > --- a/include/uapi/linux/iommu.h
> > +++ b/include/uapi/linux/iommu.h
> > @@ -288,7 +288,8 @@ struct iommu_gpasid_bind_data_vtd {
> >  #define IOMMU_SVA_VTD_GPASID_PWT	(1 << 3) /* page-level write through */
> >  #define IOMMU_SVA_VTD_GPASID_EMTE	(1 << 4) /* extended mem type enable */
> >  #define IOMMU_SVA_VTD_GPASID_CD	(1 << 5) /* PASID-level cache disable */
> > -#define IOMMU_SVA_VTD_GPASID_LAST	(1 << 6)
> > +#define IOMMU_SVA_VTD_GPASID_WPE	(1 << 6) /* Write protect enable */
> > +#define IOMMU_SVA_VTD_GPASID_LAST	(1 << 7)
> > __u64 flags;
> > __u32 pat;
> > __u32 emt;
> > --
> > 2.25.1  
> 


Thanks,

Jacob


[PATCH 4/4] iommu/vt-d: Calculate and set flags for handle_mm_fault

2021-02-18 Thread Jacob Pan
Page requests originate from user page faults. Therefore, we
shall set FAULT_FLAG_USER.

FAULT_FLAG_REMOTE indicates that we are walking an mm which is not
guaranteed to be the same as the current->mm and should not be subject
to protection key enforcement. Therefore, we should set FAULT_FLAG_REMOTE
to avoid faults when both SVM and PKEY are used.

References: commit 1b2ee1266ea6 ("mm/core: Do not enforce PKEY permissions on remote mm access")
Reviewed-by: Raj Ashok 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/svm.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index ff7ae7cc17d5..7bfd20a24a60 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -1086,6 +1086,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
struct intel_iommu *iommu = d;
struct intel_svm *svm = NULL;
int head, tail, handled = 0;
+   unsigned int flags = 0;
 
/* Clear PPR bit before reading head/tail registers, to
 * ensure that we get a new interrupt if needed. */
@@ -1186,9 +1187,11 @@ static irqreturn_t prq_event_thread(int irq, void *d)
if (access_error(vma, req))
goto invalid;
 
-   ret = handle_mm_fault(vma, address,
- req->wr_req ? FAULT_FLAG_WRITE : 0,
- NULL);
+   flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE;
+   if (req->wr_req)
+   flags |= FAULT_FLAG_WRITE;
+
+   ret = handle_mm_fault(vma, address, flags, NULL);
if (ret & VM_FAULT_ERROR)
goto invalid;
 
-- 
2.25.1
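
Why FAULT_FLAG_REMOTE matters here: handle_mm_fault() itself gates access
with arch_vma_access_permitted(), and passing the remote flag as the
"foreign" argument is what exempts this mm walk from pkey enforcement.
Paraphrased from mm/memory.c of this era, not a new addition:

	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
				       flags & FAULT_FLAG_INSTRUCTION,
				       flags & FAULT_FLAG_REMOTE))
		return VM_FAULT_SIGSEGV;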



[PATCH 3/4] iommu/vt-d: Reject unsupported page request modes

2021-02-18 Thread Jacob Pan
When supervisor/privilege mode SVM is used, we bind init_mm.pgd with
a supervisor PASID. There should not be any page fault for init_mm.
Execution request with DMA read is also not supported.

This patch checks the PRQ descriptor for both unsupported configurations
and rejects them with invalid responses.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/svm.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 23a1e4f58c54..ff7ae7cc17d5 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -1113,7 +1113,17 @@ static irqreturn_t prq_event_thread(int irq, void *d)
   ((unsigned long long *)req)[1]);
goto no_pasid;
}
-
+   /* We shall not receive page request for supervisor SVM */
+   if (req->pm_req && (req->rd_req | req->wr_req)) {
+   pr_err("Unexpected page request in Privilege Mode");
+   /* No need to find the matching sdev as for bad_req */
+   goto no_pasid;
+   }
+   /* DMA read with exec request is not supported. */
+   if (req->exe_req && req->rd_req) {
+   pr_err("Execution request not supported\n");
+   goto no_pasid;
+   }
if (!svm || svm->pasid != req->pasid) {
rcu_read_lock();
svm = ioasid_find(NULL, req->pasid, NULL);
-- 
2.25.1



[PATCH 1/4] iommu/vt-d: Enable write protect for supervisor SVM

2021-02-18 Thread Jacob Pan
Write protect bit, when set, inhibits supervisor writes to the read-only
pages. In supervisor shared virtual addressing (SVA), where page tables
are shared between CPU and DMA, IOMMU PASID entry WPE bit should match
CR0.WP bit in the CPU.
This patch sets WPE bit for supervisor PASIDs if CR0.WP is set.

Signed-off-by: Sanjay Kumar 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/pasid.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 0cceaabc3ce6..0b7e0e726ade 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -410,6 +410,15 @@ static inline void pasid_set_sre(struct pasid_entry *pe)
	pasid_set_bits(&pe->val[2], 1 << 0, 1);
 }
 
+/*
+ * Setup the WPE(Write Protect Enable) field (Bit 132) of a
+ * scalable mode PASID entry.
+ */
+static inline void pasid_set_wpe(struct pasid_entry *pe)
+{
+   pasid_set_bits(&pe->val[2], 1 << 4, 1 << 4);
+}
+
 /*
  * Setup the P(Present) field (Bit 0) of a scalable mode PASID
  * entry.
@@ -553,6 +562,20 @@ static void pasid_flush_caches(struct intel_iommu *iommu,
}
 }
 
+static inline int pasid_enable_wpe(struct pasid_entry *pte)
+{
+   unsigned long cr0 = read_cr0();
+
+   /* CR0.WP is normally set but just to be sure */
+   if (unlikely(!(cr0 & X86_CR0_WP))) {
+   pr_err_ratelimited("No CPU write protect!\n");
+   return -EINVAL;
+   }
+   pasid_set_wpe(pte);
+
+   return 0;
+};
+
 /*
  * Set up the scalable mode pasid table entry for first only
  * translation type.
@@ -584,6 +607,9 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
return -EINVAL;
}
pasid_set_sre(pte);
+   if (pasid_enable_wpe(pte))
+   return -EINVAL;
+
}
 
if (flags & PASID_FLAG_FL5LP) {
-- 
2.25.1
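
For reference, the bit arithmetic behind the two helpers above: the
scalable-mode PASID entry is an array of 64-bit words, so spec bit N lands
in val[N / 64] at bit position N % 64:

	WPE: bit 132 -> val[132 / 64] = val[2], 132 % 64 = 4, mask 1 << 4
	SRE: bit 128 -> val[128 / 64] = val[2], 128 % 64 = 0, mask 1 << 0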



[PATCH 0/4] Misc vSVA fixes for VT-d

2021-02-18 Thread Jacob Pan
Hi Baolu et al,

This is a collection of SVA-related fixes.

Thanks,
Jacob


Jacob Pan (4):
  iommu/vt-d: Enable write protect for supervisor SVM
  iommu/vt-d: Enable write protect propagation from guest
  iommu/vt-d: Reject unsupported page request modes
  iommu/vt-d: Calculate and set flags for handle_mm_fault

 drivers/iommu/intel/pasid.c | 31 +++
 drivers/iommu/intel/svm.c   | 21 +
 include/uapi/linux/iommu.h  |  3 ++-
 3 files changed, 50 insertions(+), 5 deletions(-)

-- 
2.25.1



[PATCH 2/4] iommu/vt-d: Enable write protect propagation from guest

2021-02-18 Thread Jacob Pan
Write protect bit, when set, inhibits supervisor writes to the read-only
pages. In guest supervisor shared virtual addressing (SVA), write-protect
should be honored upon guest bind supervisor PASID request.

This patch extends the VT-d portion of the IOMMU UAPI to include WP bit.
WPE bit of the supervisor PASID entry will be set to match CPU CR0.WP bit.

Signed-off-by: Sanjay Kumar 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/pasid.c | 5 +
 include/uapi/linux/iommu.h  | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 0b7e0e726ade..c7a2ec930af4 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -763,6 +763,11 @@ intel_pasid_setup_bind_data(struct intel_iommu *iommu, 
struct pasid_entry *pte,
return -EINVAL;
}
pasid_set_sre(pte);
+   /* Enable write protect WP if guest requested */
+   if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_WPE) {
+   if (pasid_enable_wpe(pte))
+   return -EINVAL;
+   }
}
 
if (pasid_data->flags & IOMMU_SVA_VTD_GPASID_EAFE) {
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
index 68cb558fe8db..33f3dc7a91de 100644
--- a/include/uapi/linux/iommu.h
+++ b/include/uapi/linux/iommu.h
@@ -288,7 +288,8 @@ struct iommu_gpasid_bind_data_vtd {
 #define IOMMU_SVA_VTD_GPASID_PWT   (1 << 3) /* page-level write through */
 #define IOMMU_SVA_VTD_GPASID_EMTE  (1 << 4) /* extended mem type enable */
 #define IOMMU_SVA_VTD_GPASID_CD	(1 << 5) /* PASID-level cache disable */
-#define IOMMU_SVA_VTD_GPASID_LAST  (1 << 6)
+#define IOMMU_SVA_VTD_GPASID_WPE   (1 << 6) /* Write protect enable */
+#define IOMMU_SVA_VTD_GPASID_LAST  (1 << 7)
__u64 flags;
__u32 pat;
__u32 emt;
-- 
2.25.1



Re: [PATCH v13 01/15] iommu: Introduce attach/detach_pasid_table API

2020-11-18 Thread Jacob Pan
Hi Eric,

On Wed, 18 Nov 2020 12:21:37 +0100, Eric Auger 
wrote:

> In virtualization use case, when a guest is assigned
> a PCI host device, protected by a virtual IOMMU on the guest,
> the physical IOMMU must be programmed to be consistent with
> the guest mappings. If the physical IOMMU supports two
> translation stages it makes sense to program guest mappings
> onto the first stage/level (ARM/Intel terminology) while the host
> owns the stage/level 2.
> 
> In that case, it is mandated to trap on guest configuration
> settings and pass those to the physical iommu driver.
> 
> This patch adds a new API to the iommu subsystem that allows
> to set/unset the pasid table information.
> 
> A generic iommu_pasid_table_config struct is introduced in
> a new iommu.h uapi header. This is going to be used by the VFIO
> user API.
> 
> Signed-off-by: Jean-Philippe Brucker 
> Signed-off-by: Liu, Yi L 
> Signed-off-by: Ashok Raj 
> Signed-off-by: Jacob Pan 
> Signed-off-by: Eric Auger 
> 
> ---
> 
> v12 -> v13:
> - Fix config check
> 
> v11 -> v12:
> - add argsz, name the union
> ---
>  drivers/iommu/iommu.c  | 68 ++
>  include/linux/iommu.h  | 21 
>  include/uapi/linux/iommu.h | 54 ++
>  3 files changed, 143 insertions(+)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index b53446bb8c6b..978fe34378fb 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -2171,6 +2171,74 @@ int iommu_uapi_sva_unbind_gpasid(struct iommu_domain *domain, struct device *dev
> }
>  EXPORT_SYMBOL_GPL(iommu_uapi_sva_unbind_gpasid);
>  
> +int iommu_attach_pasid_table(struct iommu_domain *domain,
> +  struct iommu_pasid_table_config *cfg)
> +{
> + if (unlikely(!domain->ops->attach_pasid_table))
> + return -ENODEV;
> +
> + return domain->ops->attach_pasid_table(domain, cfg);
> +}
> +
> +int iommu_uapi_attach_pasid_table(struct iommu_domain *domain,
> +   void __user *uinfo)
> +{
> + struct iommu_pasid_table_config pasid_table_data = { 0 };
> + u32 minsz;
> +
> + if (unlikely(!domain->ops->attach_pasid_table))
> + return -ENODEV;
> +
> + /*
> +  * No new spaces can be added before the variable sized union, the
> +  * minimum size is the offset to the union.
> +  */
> + minsz = offsetof(struct iommu_pasid_table_config, vendor_data);
> +
> + /* Copy minsz from user to get flags and argsz */
> + if (copy_from_user(&pasid_table_data, uinfo, minsz))
> + return -EFAULT;
> +
> + /* Fields before the variable size union are mandatory */
> + if (pasid_table_data.argsz < minsz)
> + return -EINVAL;
> +
> + /* PASID and address granu require additional info beyond minsz */
> + if (pasid_table_data.version != PASID_TABLE_CFG_VERSION_1)
> + return -EINVAL;
> + if (pasid_table_data.format == IOMMU_PASID_FORMAT_SMMUV3 &&
> + pasid_table_data.argsz <
> + offsetofend(struct iommu_pasid_table_config, vendor_data.smmuv3))
> + return -EINVAL;
> +
> + /*
> +  * User might be using a newer UAPI header which has a larger data
> +  * size, we shall support the existing flags within the current
> +  * size. Copy the remaining user data _after_ minsz but not more
> +  * than the current kernel supported size.
> +  */
> + if (copy_from_user((void *)&pasid_table_data + minsz, uinfo + minsz,
> +    min_t(u32, pasid_table_data.argsz, sizeof(pasid_table_data)) - minsz))
> + return -EFAULT;
> +
> + /* Now the argsz is validated, check the content */
> + if (pasid_table_data.config < IOMMU_PASID_CONFIG_TRANSLATE ||
> + pasid_table_data.config > IOMMU_PASID_CONFIG_ABORT)
> + return -EINVAL;
> +
> + return domain->ops->attach_pasid_table(domain, &pasid_table_data);
> +}
> +EXPORT_SYMBOL_GPL(iommu_uapi_attach_pasid_table);
> +
> +void iommu_detach_pasid_table(struct iommu_domain *domain)
> +{
> + if (unlikely(!domain->ops->detach_pasid_table))
> + return;
> +
> + domain->ops->detach_pasid_table(domain);
> +}
> +EXPORT_SYMBOL_GPL(iommu_detach_pasid_table);
> +
>  static void __iommu_detach_device(struct iommu_domain *domain,
> struct device *dev)
>  {
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index b95a6f8db6ff..464fcbecf841 100644
> --- a/include/linux/iommu.h
> ++
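
To illustrate the argsz convention the two copy_from_user() steps above
implement, a hypothetical user-space sketch (the fd and ioctl name are
placeholders; struct fields abbreviated from the uapi header this series
adds):

	struct iommu_pasid_table_config cfg = {
		.argsz   = sizeof(cfg),	/* kernel copies min(argsz, its own size) */
		.version = PASID_TABLE_CFG_VERSION_1,
		.format  = IOMMU_PASID_FORMAT_SMMUV3,
		.config  = IOMMU_PASID_CONFIG_TRANSLATE,
		/* .base_ptr, .pasid_bits, .vendor_data.smmuv3, ... */
	};

	if (ioctl(container_fd, VFIO_IOMMU_SET_PASID_TABLE, &cfg))	/* placeholder ioctl */
		perror("set pasid table");

An older binary with a smaller argsz keeps working because the kernel only
copies and validates up to the size it knows about, which is exactly what
the min_t() bound in the second copy_from_user() enforces.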

Re: [PATCH v3 01/14] docs: Document IO Address Space ID (IOASID) APIs

2020-11-02 Thread Jacob Pan
Hi Jean-Philippe,

On Fri, 30 Oct 2020 11:18:27 +0100, Jean-Philippe Brucker
 wrote:

> On Mon, Oct 26, 2020 at 02:05:06PM -0700, Jacob Pan wrote:
> > > This looks good to me, with small comments below.
> > >   
> > Can I add your Reviewed-by tag after addressing the comments?  
> 
> Yes sure, this took forever to review so I'm happy not to do another
> pass :)
> 
I am afraid I have to ask for another round of reviews since it was
suggested to keep the IOASID allocation interface independent, instead of
being part of the VFIO UAPI. Yi and I are working out the details to come
up with a PoC. As you might be aware, the need for this independent
interface is that we may have multiple users of PASID, e.g. VDPA, user
space drivers, etc. The IOASID user interface also has a slight impact on
the IOASID core code, which is why I am slow in responding to your code
review. I will incorporate your review in the next round with support for
the independent user API.
Much appreciated!

> 
> > > > +Each IOASID set is created with a token, which can be one of the
> > > > +following token types:
> > > > +
> > > > + - IOASID_SET_TYPE_NULL (Arbitrary u64 value)
> > > 
> > > Maybe NULL isn't the best name then. NONE?
> > >   
> > Agreed, 'NONE' makes more sense.  
> 
> Although patch 5 only allows a NULL token for this type. So the name seems
> fine, you could just fix this description.
> 
OK.

> 
> > > > +IOASID core has the notion of "custom allocator" such that guest
> > > > can +register virtual command allocator that precedes the default
> > > > one.
> > > 
> > > "Supersedes", rather than "precedes"?
> > >   
> > My understanding is that 'supersede' means replace something but
> > 'precede' means get in front of something. I do want to emphasis that
> > the custom allocator takes precedence over the default allocator.  
> 
> Right it's ambiguous. The custom allocator does entirely replace the
> allocation action, but the default one is still used for storage. Anyway,
> you can leave this.
> 
OK

> 
> > > > +Let's examine the IOASID life cycle again when free happens
> > > > *before* +unbind. This could be a result of misbehaving guests or
> > > > crash. Assuming +VFIO cannot enforce unbind->free order. Notice
> > > > that the setup part up +until step #12 is identical to the normal
> > > > case, the flow below starts +with step 13.
> > > > +
> > > > +::
> > > > +
> > > > + VFIO     IOMMU     KVM     VDCM     IOASID     Ref
> > > > +   ..
> > > > +   13  GUEST STARTS DMA --
> > > > +   14  *GUEST MISBEHAVES!!!*
> > > > +   15 ioasid_free()
> > > > +   16                                ioasid_notify(FREE)
> > > > +   17                                mark_free_pending (1)
> > > 
> > > Could we use superscript ¹²³⁴ for footnotes? These look like function
> > > parameters
> > >   
> > yes, much better
> >   
> > > > +   18  kvm_nb_handler(FREE)
> > > > +   19  vmcs_update_atomic()
> > > > +   20  ioasid_put_locked()   ->   3
> > > > +   21   vdcm_nb_handler(FREE)
> > > > +   22iomm_nb_handler(FREE)
> > > 
> > > iommu_nb_handler
> > >   
> > got it
> >   
> > > > +   23 ioasid_free() returns (2)  schedule_work()   2
> > > 
> > > I completely lost track here, couldn't figure out in which direction
> > > to read the diagram. What work is scheduled?  
> > The time line goes downward but we only control the notification order
> > in terms of when the events are received. Some completions are async
> > thus out of order done by work items. The only in-order completion is
> > the KVM update of its PASID translation table.
> > 
> > After #23, the async works are scheduled to complete clean up work
> > outside the spinlock(held by the caller of the atomic notifier).
> > 
> > Any suggestions to improve the readability of the time line?  
> 
> Maybe explain what happens from line 23: ioasid_free() schedules... a FREE
> notification? Which happens on line 24 (corresponding to the second
> schedule_work()?) 

Re: [PATCH v3 01/14] docs: Document IO Address Space ID (IOASID) APIs

2020-10-26 Thread Jacob Pan
Hi Jean-Philippe,

Thanks a lot for the review. Comments inline.

On Tue, 20 Oct 2020 15:58:09 +0200, Jean-Philippe Brucker
 wrote:

> On Mon, Sep 28, 2020 at 02:38:28PM -0700, Jacob Pan wrote:
> > IOASID is used to identify address spaces that can be targeted by device
> > DMA. It is a system-wide resource that is essential to its many users.
> > This document is an attempt to help developers from all vendors navigate
> > the APIs. At this time, ARM SMMU and Intel’s Scalable IO Virtualization
> > (SIOV) enabled platforms are the primary users of IOASID. Examples of
> > how SIOV components interact with IOASID APIs are provided in that many
> > APIs are driven by the requirements from SIOV.
> > 
> > Cc: Jonathan Corbet 
> > Cc: linux-...@vger.kernel.org
> > Cc: Randy Dunlap 
> > Signed-off-by: Liu Yi L 
> > Signed-off-by: Wu Hao 
> > Signed-off-by: Jacob Pan   
> 
> This looks good to me, with small comments below.
> 
Can I add your Reviewed-by tag after addressing the comments?

> > ---
> >  Documentation/driver-api/ioasid.rst | 648
> >  1 file changed, 648 insertions(+)
> >  create mode 100644 Documentation/driver-api/ioasid.rst
> > 
> > diff --git a/Documentation/driver-api/ioasid.rst
> > b/Documentation/driver-api/ioasid.rst new file mode 100644
> > index ..7f8e702997ab
> > --- /dev/null
> > +++ b/Documentation/driver-api/ioasid.rst
> > @@ -0,0 +1,648 @@
> > +.. SPDX-License-Identifier: GPL-2.0
> > +.. ioasid:
> > +
> > +===================
> > +IO Address Space ID
> > +===================
> > +
> > +IOASID is a generic name for PCIe Process Address ID (PASID) or ARM
> > +SMMU SubstreamID. An IOASID identifies an address space that DMA
> > +requests can target.
> > +
> > +The primary use cases for IOASID are Shared Virtual Address (SVA) and
> > +multiple IOVA spaces per device. However, the requirements for IOASID
> > +management can vary among hardware architectures.
> > +
> > +For baremetal IOVA, IOASID #0 is used for DMA request without  
> 
>bare metal
got it

> 
> > +PASID. Even though some architectures such as VT-d also offers
> > +the flexibility of using any PASIDs for DMA request without PASID.
> > +PASID #0 is reserved and not allocated from any ioasid_set.
> > +
> > +Multiple IOVA spaces per device are mapped to auxiliary domains which
> > +can be used for mediated device assignment with and without a virtual
> > +IOMMU (vIOMMU). An IOASID is allocated for each auxiliary domain as
> > default +PASID. Without vIOMMU, default IOASID is used for DMA map/unmap
> > +APIs. With vIOMMU, default IOASID is used for guest IOVA where DMA
> > +request with PASID is required for the device. The reason is that
> > +there is only one PASID #0 per device, e.g. VT-d, RID_PASID is per
> > PCI  
> 
>on VT-d
got that
> 
> > +device.
> > +
> > +This document covers the generic features supported by IOASID
> > +APIs. Vendor-specific use cases are also illustrated with Intel's VT-d
> > +based platforms as the first example.
> > +
> > +.. contents:: :local:
> > +
> > +Glossary
> > +
> > +PASID - Process Address Space ID
> > +
> > +IOASID - IO Address Space ID (generic term for PCIe PASID and
> > +SubstreamID in SMMU)
> > +
> > +SVA/SVM - Shared Virtual Addressing/Memory
> > +
> > +ENQCMD - Intel X86 ISA for efficient workqueue submission [1]
> > +!!!TODO: Link to Spec at the bottom  
> 
> Yes, or maybe hyperlinks at the end of this section would be better. There
> are references and lists all over the document so keeping things as close
> as possible avoids confusion.
> 
sounds good.

> > +
> > +DSA - Intel Data Streaming Accelerator [2]
> > +
> > +VDCM - Virtual Device Composition Module [3]
> > +
> > +SIOV - Intel Scalable IO Virtualization
> > +
> > +
> > +Key Concepts
> > +
> > +
> > +IOASID Set
> > +----------
> > +An IOASID set is a group of IOASIDs allocated from the system-wide
> > +IOASID pool. Refer to IOASID set APIs for more details.  
> 
>  IOASID Set Level APIs
> 
Yes, I should use the exact section title.

> > +
> > +IOASID set is particularly useful for guest SVA where each guest could
> > +have its own IOASID set for security and efficiency reasons.
> > +
> > +IOASID Set Private ID (SPID)
> > +
> > +Each IOASID

Re: [RFC PATCH 0/2] iommu: Avoid unnecessary PRI queue flushes

2020-10-19 Thread Jacob Pan
Hi Jean-Philippe,

On Mon, 19 Oct 2020 16:08:24 +0200, Jean-Philippe Brucker
 wrote:

> On Sat, Oct 17, 2020 at 04:25:25AM -0700, Raj, Ashok wrote:
> > > For devices that *don't* use a stop marker, the PCIe spec says
> > > (10.4.1.2):
> > > 
> > >   To stop [using a PASID] without using a Stop Marker Message, the
> > >   function shall:
> > >   1. Stop queueing new Page Request Messages for this PASID.  
> > 
> > The device driver would need to tell the device to stop sending any new PRs.
> >   
> > >   2. Finish transmitting any multi-page Page Request Messages for this
> > >  PASID (i.e. send the Page Request Message with the L bit Set).
> > >   3. Wait for PRG Response Messages associated any outstanding Page
> > >  Request Messages for the PASID.
> > > 
> > > So they have to flush their PR themselves. And since the device driver
> > > completes this sequence before calling unbind(), then there shouldn't
> > > be any oustanding PR for the PASID, and unbind() doesn't need to
> > > flush, right?  
> > 
> > I can see how the device can complete #2,3 above. But the device driver
> > isn't the one managing page-responses right. So in order for the device
> > to know the above sequence is complete, it would need to get some
> > assist from IOMMU driver?  
> 
> No the device driver just waits for the device to indicate that it has
> completed the sequence. That's what the magic stop-PASID mechanism
> described by PCIe does. In 6.20.1 "Managing PASID TLP Prefix Usage" it
> says:
> 
> "A Function must have a mechanism to request that it gracefully stop using
>  a specific PASID. This mechanism is device specific but must satisfy the
>  following rules:
>  [...]
>  * When the stop request mechanism indicates completion, the Function has:
>[...]
>* Complied with additional rules described in Address Translation
>  Services (Chapter 10 [10.4.1.2 quoted above]) if Address Translations
>  or Page Requests were issued on the behalf of this PASID."
> 
> So after the device driver initiates this mechanism in the device, the
> device must be able to indicate completion of the mechanism, which
> includes completing all in-flight Page Requests. At that point the device
> driver can call unbind() knowing there is no pending PR for this PASID.
> 
In step #3, I think it is possible that the device driver received a page
response as part of the auto page response, so it is not guaranteed that
all the in-flight PRQs have completed inside the IOMMU. Therefore, a drain
is _always_ needed to be sure?

> Thanks,
> Jean
> 
> > 
> > How does the driver know that everything host received has been
> > responded back to device?
> >   
> > >   
> > > > I'm not sure about other IOMMU's how they behave, When there is no
> > > > space in the PRQ, IOMMU auto-responds to the device. This puts the
> > > > device in a while (1) loop. The fake successful response will let
> > > > the device do a ATS lookup, and that would fail forcing the device
> > > > to do another PRQ.  
> > > 
> > > But in the sequence above, step 1 should ensure that the device will
> > > not send another PR for any successful response coming back at step
> > > 3.  
> > 
> > True, but there could be some page-request in flight on its way to the
> > IOMMU. By draining and getting that round trip back to IOMMU we
> > gaurantee things in flight are flushed to PRQ after that Drain
> > completes.  
> > > 
> > > So I agree with the below if we suspect there could be pending PR, but
> > > given that pending PR are a stop marker thing and we don't know any
> > > device using stop markers, I wondered why I bothered implementing
> > > PRIq flush at all for SMMUv3, hence this RFC.
> > >   
> > 
> > Cheers,
> > Ashok  


Thanks,

Jacob


Re: [PATCH v3 00/14] IOASID extensions for guest SVA

2020-10-19 Thread Jacob Pan
Hi,

Any comments on this? I know we have some open issues w.r.t. the PASID
management UAPI, but I think having these common kernel API features is
justified.

Thanks!

Jacob


On Mon, 28 Sep 2020 14:38:27 -0700, Jacob Pan 
wrote:

> IOASID was introduced in v5.5 as a generic kernel allocator service for
> both PCIe Process Address Space ID (PASID) and ARM SMMU's Sub Stream
> ID. In addition to basic ID allocation, ioasid_set was defined as a
> token that is shared by a group of IOASIDs. This set token can be used
> for permission checking, but it lacks some features needed to address the
> following needs of guest Shared Virtual Address (SVA):
> - Manage IOASIDs by group, group ownership, quota, etc.
> - State synchronization among IOASID users
> - Non-identity guest-host IOASID mapping
> - Lifecycle management across many users
> 
> This patchset introduces the following extensions as solutions to the
> problems above.
> - Redefine and extend IOASID set such that IOASIDs can be managed by
> groups.
> - Add notifications for IOASID state synchronization
> - Add reference counting for life cycle alignment among users
> - Support ioasid_set private IDs, which can be used as guest IOASIDs
> Please refer to Documentation/ioasid.rst in enclosed patch 1/9 for more
> details.
> 
> This patchset only included VT-d driver as users of some of the new APIs.
> VFIO and KVM patches are coming up to fully utilize the APIs introduced
> here.
> 
> You can find this series at:
> https://github.com/jacobpan/linux.git ioasid_v3
> (VFIO and KVM patches will be available at this branch when published.)
> 
> This work is a result of collaboration with many people:
> Liu, Yi L 
> Wu Hao 
> Ashok Raj 
> Kevin Tian 
> 
> Thanks,
> 
> Jacob
> 
> Changelog:
> 
> V3:
> - Use consistent ioasid_set_ prefix for ioasid_set level APIs
> - Make SPID and private detach/attach APIs symmetric
> - Use the same ioasid_put semantics as Jean-Phillippe IOASID reference
> patch
> - Take away the public ioasid_notify() function, notifications are now
> emitted by IOASID core as a result of certain IOASID APIs
> - Partition into finer incremental patches
> - Miscellaneous cleanup, locking, exception handling fixes based on v2
> reviews
> 
> V2:
> - Redesigned ioasid_set APIs, removed set ID
> - Added set private ID (SPID) for guest PASID usage.
> - Add per ioasid_set notification and priority support.
> - Back to use spinlocks and atomic notifications.
> - Added async work in VT-d driver to perform teardown outside atomic
> context
> 
> Jacob Pan (14):
>   docs: Document IO Address Space ID (IOASID) APIs
>   iommu/ioasid: Rename ioasid_set_data()
>   iommu/ioasid: Add a separate function for detach data
>   iommu/ioasid: Support setting system-wide capacity
>   iommu/ioasid: Redefine IOASID set and allocation APIs
>   iommu/ioasid: Introduce API to adjust the quota of an ioasid_set
>   iommu/ioasid: Add an iterator API for ioasid_set
>   iommu/ioasid: Add reference couting functions
>   iommu/ioasid: Introduce ioasid_set private ID
>   iommu/ioasid: Introduce notification APIs
>   iommu/ioasid: Support mm type ioasid_set notifications
>   iommu/vt-d: Remove mm reference for guest SVA
>   iommu/vt-d: Listen to IOASID notifications
>   iommu/vt-d: Store guest PASID during bind
> 
>  Documentation/driver-api/ioasid.rst | 648 ++
>  drivers/iommu/intel/iommu.c |  29 +-
>  drivers/iommu/intel/pasid.h |   1 +
>  drivers/iommu/intel/svm.c   | 132 +-
>  drivers/iommu/ioasid.c  | 890 ++--
>  include/linux/intel-iommu.h |   2 +
>  include/linux/ioasid.h  | 197 +++-
>  7 files changed, 1830 insertions(+), 69 deletions(-)
>  create mode 100644 Documentation/driver-api/ioasid.rst
> 


Thanks,

Jacob


[PATCH v3 11/14] iommu/ioasid: Support mm type ioasid_set notifications

2020-09-28 Thread Jacob Pan
As a system-wide resource, IOASID is often shared by multiple kernel
subsystems that are independent of each other. However, at the
ioasid_set level, these kernel subsystems must communicate with each
other for ownership checking, event notifications, etc. For example, on
Intel Scalable IO Virtualization (SIOV) enabled platforms, KVM and VFIO
instances under the same process/guest must be aware of a shared IOASID
set.
IOASID_SET_TYPE_MM token type was introduced to explicitly mark an
IOASID set that belongs to a process, thus use the same mm_struct
pointer as a token. Users of the same process can then identify with
each other based on this token.

This patch introduces MM token specific event registration APIs. Event
subscribers such as KVM instances can register IOASID event handler
without the knowledge of its ioasid_set. Event handlers are registered
based on its mm_struct pointer as a token. In case when subscribers
register handler *prior* to the creation of the ioasid_set, the
handler’s notification block is stored in a pending list within IOASID
core. Once the ioasid_set of the MM token is created, the notification
block will be registered by the IOASID core.

Signed-off-by: Liu Yi L 
Signed-off-by: Wu Hao 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 117 +
 include/linux/ioasid.h |  15 +++
 2 files changed, 132 insertions(+)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 894b17c06ead..d5faeb559a43 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -889,6 +889,29 @@ void ioasid_set_put(struct ioasid_set *set)
 }
 EXPORT_SYMBOL_GPL(ioasid_set_put);
 
+/*
+ * ioasid_find_mm_set - Retrieve IOASID set with mm token
+ * Take a reference of the set if found.
+ */
+static struct ioasid_set *ioasid_find_mm_set(struct mm_struct *token)
+{
+   struct ioasid_set *set;
+   unsigned long index;
+
+   spin_lock(&ioasid_allocator_lock);
+
+   xa_for_each(&ioasid_sets, index, set) {
+   if (set->type == IOASID_SET_TYPE_MM && set->token == token) {
+   refcount_inc(&set->ref);
+   goto exit_unlock;
+   }
+   }
+   set = NULL;
+exit_unlock:
+   spin_unlock(&ioasid_allocator_lock);
+   return set;
+}
+
 /**
  * ioasid_adjust_set - Adjust the quota of an IOASID set
  * @set:   IOASID set to be assigned
@@ -1121,6 +1144,100 @@ void ioasid_unregister_notifier(struct ioasid_set *set,
 }
 EXPORT_SYMBOL_GPL(ioasid_unregister_notifier);
 
+/**
+ * ioasid_register_notifier_mm - Register a notifier block on the IOASID set
+ *   created by the mm_struct pointer as the token
+ *
+ * @mm: the mm_struct token of the ioasid_set
+ * @nb: notifier block to be registered on the ioasid_set
+ *
+ * This is a variant of ioasid_register_notifier() where the caller intends to
+ * listen to IOASID events belonging to the ioasid_set created under the same
+ * process. Caller is not aware of the ioasid_set, no need to hold reference
+ * of the ioasid_set.
+ */
+int ioasid_register_notifier_mm(struct mm_struct *mm, struct notifier_block *nb)
+{
+   struct ioasid_set_nb *curr;
+   struct ioasid_set *set;
+   int ret = 0;
+
+   if (!mm)
+   return -EINVAL;
+
+   spin_lock(&ioasid_nb_lock);
+
+   /* Check for duplicates, nb is unique per set */
+   list_for_each_entry(curr, &ioasid_nb_pending_list, list) {
+   if (curr->token == mm && curr->nb == nb) {
+   ret = -EBUSY;
+   goto exit_unlock;
+   }
+   }
+
+   /* Check if the token has an existing set */
+   set = ioasid_find_mm_set(mm);
+   if (!set) {
+   /* Add to the rsvd list as inactive */
+   curr->active = false;
+   } else {
+   /* REVISIT: Only register empty set for now. Can add an option
+* in the future to playback existing PASIDs.
+*/
+   if (set->nr_ioasids) {
+   pr_warn("IOASID set %d not empty\n", set->id);
+   ret = -EBUSY;
+   goto exit_unlock;
+   }
+   curr = kzalloc(sizeof(*curr), GFP_ATOMIC);
+   if (!curr) {
+   ret = -ENOMEM;
+   goto exit_unlock;
+   }
+   curr->token = mm;
+   curr->nb = nb;
+   curr->active = true;
+   curr->set = set;
+
+   /* Set already created, add to the notifier chain */
+   atomic_notifier_chain_register(&set->nh, nb);
+   /*
+* Do not hold a reference, if the set gets destroyed, the nb
+* entry will be marked inactive.
+*/
+   ioasid_set_put(set);
+   }
+
+   list_add(&curr->list, &ioasid_nb_pending_list
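
Usage-wise, the point of the mm-token variant is that a subscriber such as
KVM never needs the ioasid_set pointer itself. A minimal subscriber sketch
(handler body illustrative):

	static int kvm_pasid_event(struct notifier_block *nb,
				   unsigned long cmd, void *data)
	{
		struct ioasid_nb_args *args = data;

		if (cmd == IOASID_NOTIFY_FREE)
			/* e.g. invalidate the PASID translation for args->id */
			;
		return NOTIFY_OK;
	}

	static struct notifier_block kvm_pasid_nb = {
		.notifier_call = kvm_pasid_event,
	};

	/* at VM setup, possibly before the ioasid_set exists */
	ioasid_register_notifier_mm(current->mm, &kvm_pasid_nb);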

[PATCH v3 06/14] iommu/ioasid: Introduce API to adjust the quota of an ioasid_set

2020-09-28 Thread Jacob Pan
Each ioasid_set is given a quota during allocation. As system
administrators balance resources among VMs, we shall support the
adjustment of quota at runtime. The new quota cannot be less than the
outstanding IOASIDs already allocated within the set. The extra quota
will be returned to the system-wide IOASID pool if the new quota is
smaller than the existing one.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 47 +++
 include/linux/ioasid.h |  6 ++
 2 files changed, 53 insertions(+)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 61e25c2375ab..cf8c7d34e2de 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -654,6 +654,53 @@ void ioasid_set_put(struct ioasid_set *set)
 EXPORT_SYMBOL_GPL(ioasid_set_put);
 
 /**
+ * ioasid_adjust_set - Adjust the quota of an IOASID set
+ * @set:   IOASID set to be assigned
+ * @quota: Quota allowed in this set
+ *
+ * Return 0 on success. If the new quota is smaller than the number of
+ * IOASIDs already allocated, -EINVAL will be returned. No change will be
+ * made to the existing quota.
+ */
+int ioasid_adjust_set(struct ioasid_set *set, int quota)
+{
+   int ret = 0;
+
+   if (quota <= 0)
+   return -EINVAL;
+
+   spin_lock(&ioasid_allocator_lock);
+   if (set->nr_ioasids > quota) {
+   pr_err("New quota %d is smaller than outstanding IOASIDs %d\n",
+   quota, set->nr_ioasids);
+   ret = -EINVAL;
+   goto done_unlock;
+   }
+
+   if ((quota > set->quota) &&
+   (quota - set->quota > ioasid_capacity_avail)) {
+   ret = -ENOSPC;
+   goto done_unlock;
+   }
+
+   /* Return the delta back to system pool */
+   ioasid_capacity_avail += set->quota - quota;
+
+   /*
+* May have a policy to prevent giving all available IOASIDs
+* to one set. But we don't enforce here, it should be in the
+* upper layers.
+*/
+   set->quota = quota;
+
+done_unlock:
+   spin_unlock(&ioasid_allocator_lock);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_adjust_set);
+
+/**
  * ioasid_find - Find IOASID data
  * @set: the IOASID set
  * @ioasid: the IOASID to find
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index 1ae213b660f0..0a5e82148eb9 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -62,6 +62,7 @@ struct ioasid_allocator_ops {
 void ioasid_install_capacity(ioasid_t total);
 ioasid_t ioasid_get_capacity(void);
 struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota, int type);
+int ioasid_adjust_set(struct ioasid_set *set, int quota);
 void ioasid_set_get(struct ioasid_set *set);
 void ioasid_set_put(struct ioasid_set *set);
 
@@ -99,6 +100,11 @@ static inline struct ioasid_set *ioasid_set_alloc(void *token, ioasid_t quota, int type)
return ERR_PTR(-ENOTSUPP);
 }
 
+static inline int ioasid_adjust_set(struct ioasid_set *set, int quota)
+{
+   return -ENOTSUPP;
+}
+
 static inline void ioasid_set_put(struct ioasid_set *set)
 {
 }
-- 
2.7.4
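
A short usage sketch for the API above, e.g. a VMM rebalancing PASID quota
between two guests (set pointers and sizes illustrative):

	/* shrink vm1 first; fails with -EINVAL if vm1 already holds
	 * more IOASIDs than the new quota
	 */
	ret = ioasid_adjust_set(vm1_set, 1024);
	if (!ret)
		/* grow vm2 out of the headroom vm1 just returned */
		ret = ioasid_adjust_set(vm2_set, 3072);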



[PATCH v3 07/14] iommu/ioasid: Add an iterator API for ioasid_set

2020-09-28 Thread Jacob Pan
Users of an ioasid_set may not keep track of all the IOASIDs allocated
under the set. When collective actions are needed on each IOASID, it
is useful to iterate over all the IOASIDs within the set. For example,
when the ioasid_set is freed, the user might perform the same cleanup
operation on each IOASID.

This patch adds an API to iterate all the IOASIDs within the set.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 17 +
 include/linux/ioasid.h |  9 +
 2 files changed, 26 insertions(+)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index cf8c7d34e2de..9628e78b2ab4 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -701,6 +701,23 @@ int ioasid_adjust_set(struct ioasid_set *set, int quota)
 EXPORT_SYMBOL_GPL(ioasid_adjust_set);
 
 /**
+ * ioasid_set_for_each_ioasid - Iterate over all the IOASIDs within the set
+ *
+ * Caller must hold a reference of the set and handles its own locking.
+ */
+void ioasid_set_for_each_ioasid(struct ioasid_set *set,
+   void (*fn)(ioasid_t id, void *data),
+   void *data)
+{
+   struct ioasid_data *entry;
+   unsigned long index;
+
+   xa_for_each(&set->xa, index, entry)
+   fn(index, data);
+}
+EXPORT_SYMBOL_GPL(ioasid_set_for_each_ioasid);
+
+/**
  * ioasid_find - Find IOASID data
  * @set: the IOASID set
  * @ioasid: the IOASID to find
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index 0a5e82148eb9..aab58bc26714 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -75,6 +75,9 @@ int ioasid_register_allocator(struct ioasid_allocator_ops *allocator);
 void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator);
 int ioasid_attach_data(ioasid_t ioasid, void *data);
 void ioasid_detach_data(ioasid_t ioasid);
+void ioasid_set_for_each_ioasid(struct ioasid_set *sdata,
+   void (*fn)(ioasid_t id, void *data),
+   void *data);
 #else /* !CONFIG_IOASID */
 static inline void ioasid_install_capacity(ioasid_t total)
 {
@@ -131,5 +134,11 @@ static inline int ioasid_attach_data(ioasid_t ioasid, void *data)
 static inline void ioasid_detach_data(ioasid_t ioasid)
 {
 }
+
+static inline void ioasid_set_for_each_ioasid(struct ioasid_set *sdata,
+ void (*fn)(ioasid_t id, void *data),
+ void *data)
+{
+}
 #endif /* CONFIG_IOASID */
 #endif /* __LINUX_IOASID_H */
-- 
2.7.4
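
A usage sketch for the iterator, e.g. tearing down every IOASID left in a
set (callback illustrative; per the comment above, the caller holds a set
reference and handles its own locking):

	static void teardown_one(ioasid_t id, void *data)
	{
		struct ioasid_set *set = data;

		/* driver-specific cleanup for @id, then release it */
		ioasid_free(set, id);
	}

	ioasid_set_for_each_ioasid(set, teardown_one, set);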



[PATCH v3 10/14] iommu/ioasid: Introduce notification APIs

2020-09-28 Thread Jacob Pan
Relations among IOASID users largely follow a publisher-subscriber
pattern. E.g. to support guest SVA on Intel Scalable I/O Virtualization
(SIOV) enabled platforms, VFIO, IOMMU, device drivers, KVM are all users
of IOASIDs. When a state change occurs, VFIO publishes the change event
that needs to be processed by other users/subscribers.

This patch introduces two types of notifications: global and per
ioasid_set. The latter is intended for users who only need to handle
events related to the IOASIDs of a given set.
For more information, refer to the kernel documentation at
Documentation/ioasid.rst.

Signed-off-by: Liu Yi L 
Signed-off-by: Wu Hao 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 141 +
 include/linux/ioasid.h |  57 +++-
 2 files changed, 197 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 378fef8f23d9..894b17c06ead 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -10,12 +10,35 @@
 #include 
 #include 
 
+/*
+ * An IOASID can have multiple consumers where each consumer may have
+ * hardware contexts associated with the IOASID.
+ * When a status change occurs, like on IOASID deallocation, notifier chains
+ * are used to keep the consumers in sync.
+ * This is a publisher-subscriber pattern where publisher can change the
+ * state of each IOASID, e.g. alloc/free, bind IOASID to a device and mm.
+ * On the other hand, subscribers get notified for the state change and
+ * keep local states in sync.
+ */
+static ATOMIC_NOTIFIER_HEAD(ioasid_notifier);
+/* List to hold pending notification block registrations */
+static LIST_HEAD(ioasid_nb_pending_list);
+static DEFINE_SPINLOCK(ioasid_nb_lock);
+
 /* Default to PCIe standard 20 bit PASID */
 #define PCI_PASID_MAX 0x100000
 static ioasid_t ioasid_capacity = PCI_PASID_MAX;
 static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX;
 static DEFINE_XARRAY_ALLOC(ioasid_sets);
 
+struct ioasid_set_nb {
+   struct list_headlist;
+   struct notifier_block   *nb;
+   void*token;
+   struct ioasid_set   *set;
+   boolactive;
+};
+
 enum ioasid_state {
IOASID_STATE_INACTIVE,
IOASID_STATE_ACTIVE,
@@ -365,6 +388,42 @@ void ioasid_detach_data(ioasid_t ioasid)
 }
 EXPORT_SYMBOL_GPL(ioasid_detach_data);
 
+/**
+ * ioasid_notify - Send notification on a given IOASID for status change.
+ *
+ * @data:  The IOASID data for which the notification will be sent
+ * @cmd:   Notification event sent by IOASID external users, can be
+ * IOASID_NOTIFY_BIND or IOASID_NOTIFY_UNBIND.
+ *
+ * @flags: Special instructions, e.g. notify within a set or global by
+ * IOASID_NOTIFY_FLAG_SET or IOASID_NOTIFY_FLAG_ALL flags
+ * Caller must hold ioasid_allocator_lock and reference to the IOASID
+ */
+static int ioasid_notify(struct ioasid_data *data,
+enum ioasid_notify_val cmd, unsigned int flags)
+{
+   struct ioasid_nb_args args = { 0 };
+   int ret = 0;
+
+   /* IOASID_FREE/ALLOC are internal events emitted by IOASID core only */
+   if (cmd <= IOASID_NOTIFY_FREE)
+   return -EINVAL;
+
+   if (flags & ~(IOASID_NOTIFY_FLAG_ALL | IOASID_NOTIFY_FLAG_SET))
+   return -EINVAL;
+
+   args.id = data->id;
+   args.set = data->set;
+   args.pdata = data->private;
+   args.spid = data->spid;
+   if (flags & IOASID_NOTIFY_FLAG_ALL)
+   ret = atomic_notifier_call_chain(&ioasid_notifier, cmd, &args);
+   if (flags & IOASID_NOTIFY_FLAG_SET)
+   ret = atomic_notifier_call_chain(&data->set->nh, cmd, &args);
+
+   return ret;
+}
+
 static ioasid_t ioasid_find_by_spid_locked(struct ioasid_set *set, ioasid_t spid)
 {
ioasid_t ioasid = INVALID_IOASID;
@@ -417,6 +476,7 @@ int ioasid_attach_spid(ioasid_t ioasid, ioasid_t spid)
goto done_unlock;
}
data->spid = spid;
+   ioasid_notify(data, IOASID_NOTIFY_BIND, IOASID_NOTIFY_FLAG_SET);
 
 done_unlock:
	spin_unlock(&ioasid_allocator_lock);
@@ -436,6 +496,7 @@ void ioasid_detach_spid(ioasid_t ioasid)
goto done_unlock;
}
data->spid = INVALID_IOASID;
+   ioasid_notify(data, IOASID_NOTIFY_UNBIND, IOASID_NOTIFY_FLAG_SET);
 
 done_unlock:
	spin_unlock(&ioasid_allocator_lock);
@@ -469,6 +530,28 @@ static inline bool ioasid_set_is_valid(struct ioasid_set *set)
return xa_load(_sets, set->id) == set;
 }
 
+static void ioasid_add_pending_nb(struct ioasid_set *set)
+{
+   struct ioasid_set_nb *curr;
+
+   if (set->type != IOASID_SET_TYPE_MM)
+   return;
+
+   /*
+* Check if there are any pending nb requests for the given token, if so
+* add them to the notifier chain.
+*/
+   spin_lock(&ioasid_nb_lock);
+   list_for_each_entry(curr, &ioasid_nb_pending_list, list) {
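
To make the subscriber side concrete, a sketch of a per-set handler
distinguishing the events this patch defines (handler body illustrative):

	static int vdcm_pasid_event(struct notifier_block *nb,
				    unsigned long cmd, void *data)
	{
		struct ioasid_nb_args *args = data;

		switch (cmd) {
		case IOASID_NOTIFY_BIND:	/* SPID attached, args->spid valid */
			break;
		case IOASID_NOTIFY_UNBIND:	/* SPID detached */
			break;
		case IOASID_NOTIFY_FREE:	/* emitted by the IOASID core */
			/* tear down contexts tied to args->id */
			break;
		}
		return NOTIFY_DONE;
	}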

[PATCH v3 14/14] iommu/vt-d: Store guest PASID during bind

2020-09-28 Thread Jacob Pan
IOASID core maintains the guest-host mapping in the form of SPID and
IOASID. This patch assigns the guest PASID (if valid) as SPID while
binding guest page table with a host PASID. This mapping will be used
for lookup and notifications.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/svm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 8f886718df83..e18f8b5af9ba 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -98,6 +98,7 @@ static inline bool intel_svm_capable(struct intel_iommu *iommu)
 static inline void intel_svm_drop_pasid(ioasid_t pasid)
 {
ioasid_detach_data(pasid);
+   ioasid_detach_spid(pasid);
ioasid_put(NULL, pasid);
 }
 
@@ -425,6 +426,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
if (data->flags & IOMMU_SVA_GPASID_VAL) {
svm->gpasid = data->gpasid;
svm->flags |= SVM_FLAG_GUEST_PASID;
+   ioasid_attach_spid(data->hpasid, data->gpasid);
}
ioasid_attach_data(data->hpasid, svm);
ioasid_get(NULL, svm->pasid);
-- 
2.7.4
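
With the SPID stored at bind time, notification handlers can translate a
guest PASID back to the host PASID within the VM's set. A sketch, assuming
the unlocked wrapper around the ioasid_find_by_spid_locked() helper shown
in patch 10/14:

	/* in a notification or invalidation path */
	host_pasid = ioasid_find_by_spid(vm_set, guest_pasid);
	if (host_pasid != INVALID_IOASID)
		/* operate on the host PASID entry */
		;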



[PATCH v3 12/14] iommu/vt-d: Remove mm reference for guest SVA

2020-09-28 Thread Jacob Pan
Now that IOASID core keeps track of the IOASID to mm_struct ownership in
the form of an ioasid_set with the IOASID_SET_TYPE_MM token type, there is no
need to keep the same mapping in VT-d driver specific data. Native SVM
usage is not affected by the change.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/svm.c | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 2e764e283469..39a09a93300e 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -338,12 +338,6 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
ret = -ENOMEM;
goto out;
}
-   /* REVISIT: upper layer/VFIO can track host process that bind
-* the PASID. ioasid_set = mm might be sufficient for vfio to
-* check pasid VMM ownership. We can drop the following line
-* once VFIO and IOASID set check is in place.
-*/
-   svm->mm = get_task_mm(current);
svm->pasid = data->hpasid;
if (data->flags & IOMMU_SVA_GPASID_VAL) {
svm->gpasid = data->gpasid;
@@ -351,7 +345,6 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
}
ioasid_attach_data(data->hpasid, svm);
INIT_LIST_HEAD_RCU(>devs);
-   mmput(svm->mm);
}
sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
if (!sdev) {
-- 
2.7.4



[PATCH v3 08/14] iommu/ioasid: Add reference couting functions

2020-09-28 Thread Jacob Pan
There can be multiple users of an IOASID, each user could have hardware
contexts associated with the IOASID. In order to align lifecycles,
reference counting is introduced in this patch. It is expected that when
an IOASID is being freed, each user will drop a reference only after its
context is cleared.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/ioasid.c | 117 +
 include/linux/ioasid.h |  24 ++
 2 files changed, 141 insertions(+)

diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 9628e78b2ab4..828cc44b1b1c 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -16,8 +16,26 @@ static ioasid_t ioasid_capacity = PCI_PASID_MAX;
 static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX;
 static DEFINE_XARRAY_ALLOC(ioasid_sets);
 
+enum ioasid_state {
+   IOASID_STATE_INACTIVE,
+   IOASID_STATE_ACTIVE,
+   IOASID_STATE_FREE_PENDING,
+};
+
+/**
+ * struct ioasid_data - Meta data about ioasid
+ *
+ * @id:Unique ID
+ * @users: Number of active users
+ * @state: Track state of the IOASID
+ * @set:   ioasid_set the IOASID belongs to
+ * @private:   Private data associated with the IOASID
+ * @rcu:   For free after RCU grace period
+ */
 struct ioasid_data {
ioasid_t id;
+   refcount_t users;
+   enum ioasid_state state;
struct ioasid_set *set;
void *private;
struct rcu_head rcu;
@@ -511,6 +529,8 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
goto exit_free;
}
data->id = id;
+   data->state = IOASID_STATE_ACTIVE;
+   refcount_set(&data->users, 1);
 
/* Store IOASID in the per set data */
	if (xa_err(xa_store(&set->xa, id, data, GFP_ATOMIC))) {
@@ -560,6 +580,14 @@ static void ioasid_free_locked(struct ioasid_set *set, ioasid_t ioasid)
	if (WARN_ON(!xa_load(&ioasid_sets, data->set->id)))
return;
 
+   /* Free is already in progress */
+   if (data->state == IOASID_STATE_FREE_PENDING)
+   return;
+
+   data->state = IOASID_STATE_FREE_PENDING;
+   if (!refcount_dec_and_test(&data->users))
+   return;
+
ioasid_do_free_locked(data);
 }
 
@@ -717,6 +745,95 @@ void ioasid_set_for_each_ioasid(struct ioasid_set *set,
 }
 EXPORT_SYMBOL_GPL(ioasid_set_for_each_ioasid);
 
+int ioasid_get_locked(struct ioasid_set *set, ioasid_t ioasid)
+{
+   struct ioasid_data *data;
+
+   data = xa_load(&active_allocator->xa, ioasid);
+   if (!data) {
+   pr_err("Trying to get unknown IOASID %u\n", ioasid);
+   return -EINVAL;
+   }
+   if (data->state == IOASID_STATE_FREE_PENDING) {
+   pr_err("Trying to get IOASID being freed%u\n", ioasid);
+   return -EBUSY;
+   }
+
+   /* Check set ownership if the set is non-null */
+   if (set && data->set != set) {
+   pr_err("Trying to get IOASID %u outside the set\n", ioasid);
+   /* data found but does not belong to the set */
+   return -EACCES;
+   }
+   refcount_inc(&data->users);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(ioasid_get_locked);
+
+/**
+ * ioasid_get - Obtain a reference to an ioasid
+ * @set:   the ioasid_set to check permission against if not NULL
+ * @ioasid:	the ID to take a reference on
+ *
+ *
+ * Return: 0 on success, error if failed.
+ */
+int ioasid_get(struct ioasid_set *set, ioasid_t ioasid)
+{
+   int ret;
+
+   spin_lock(&ioasid_allocator_lock);
+   ret = ioasid_get_locked(set, ioasid);
+   spin_unlock(&ioasid_allocator_lock);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(ioasid_get);
+
+bool ioasid_put_locked(struct ioasid_set *set, ioasid_t ioasid)
+{
+   struct ioasid_data *data;
+
+   data = xa_load(&active_allocator->xa, ioasid);
+   if (!data) {
+   pr_err("Trying to put unknown IOASID %u\n", ioasid);
+   return false;
+   }
+   if (set && data->set != set) {
+   pr_err("Trying to drop IOASID %u outside the set\n", ioasid);
+   return false;
+   }
+   if (!refcount_dec_and_test(&data->users))
+   return false;
+
+   ioasid_do_free_locked(data);
+
+   return true;
+}
+EXPORT_SYMBOL_GPL(ioasid_put_locked);
+
+/**
+ * ioasid_put - Release a reference to an ioasid
+ * @set:   the ioasid_set to check permission against if not NULL
+ * @ioasid:	the ID to drop a reference on
+ *
+ * Put a reference to the IOASID; free it when the number of references
+ * drops to zero.
+ *
+ * Return: %true if the IOASID was freed, %false otherwise.
+ */
+bool ioasid_put(struct ioasid_set *set, ioasid_t ioasid)
+{
+   bool ret;
+
+   spin_lock(&ioasid_allocator_lock);
+   ret = ioasid_put_locked(set, ioasid);
+   spin_unlock(&ioasid_allocator_lock);
+   return ret;
+}
+EXPORT_SYMBOL_GPL(ioa
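
The lifecycle this aligns, as a sketch with two cooperating users (error
handling elided):

	/* user A (e.g. VFIO) allocates; refcount starts at 1 */
	pasid = ioasid_alloc(set, min, max, private);

	/* user B (e.g. the IOMMU driver) takes its own reference */
	ioasid_get(set, pasid);

	/* A frees: state becomes FREE_PENDING, refcount 2 -> 1,
	 * but the IOASID is not actually released yet
	 */
	ioasid_free(set, pasid);

	/* B clears its hardware context, then drops the last
	 * reference; this performs the real free
	 */
	ioasid_put(set, pasid);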

[PATCH v3 05/14] iommu/ioasid: Redefine IOASID set and allocation APIs

2020-09-28 Thread Jacob Pan
ioasid_set was introduced as an arbitrary token that is shared by a
group of IOASIDs. For example, two IOASIDs allocated via the same
ioasid_set pointer belong to the same set.

For guest SVA usages, system-wide IOASID resources need to be
partitioned such that each VM can have its own quota and being managed
separately. ioasid_set is the perfect candidate for meeting such
requirements. This patch redefines and extends ioasid_set with the
following new fields:
- Quota
- Reference count
- Storage of its namespace
- The token is now stored in the ioasid_set with types

Basic ioasid_set level APIs are introduced to wire up this new data.
Existing users of IOASID APIs are converted where a host IOASID set is
allocated for bare-metal usage.

Signed-off-by: Liu Yi L 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/iommu.c |  26 +++--
 drivers/iommu/intel/pasid.h |   1 +
 drivers/iommu/intel/svm.c   |  25 ++--
 drivers/iommu/ioasid.c  | 277 
 include/linux/ioasid.h  |  53 +++--
 5 files changed, 333 insertions(+), 49 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index e7bcb299e51e..872391890323 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -104,6 +104,9 @@
  */
 #define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
 
+/* PASIDs used by host SVM */
+struct ioasid_set *host_pasid_set;
+
 static inline int agaw_to_level(int agaw)
 {
return agaw + 2;
@@ -3147,8 +3150,8 @@ static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
 * We can only free the PASID when all the devices are unbound.
 */
-   if (ioasid_find(NULL, ioasid, NULL)) {
-   pr_alert("Cannot free active IOASID %d\n", ioasid);
+   if (IS_ERR(ioasid_find(host_pasid_set, ioasid, NULL))) {
+   pr_err("IOASID %d to be freed but not in system set\n", ioasid);
return;
}
vcmd_free_pasid(iommu, ioasid);
@@ -3333,8 +3336,17 @@
goto free_iommu;
 
/* PASID is needed for scalable mode irrespective to SVM */
-   if (intel_iommu_sm)
+   if (intel_iommu_sm) {
ioasid_install_capacity(intel_pasid_max_id);
+   /* We should not run out of IOASIDs at boot */
+   host_pasid_set = ioasid_set_alloc(NULL, PID_MAX_DEFAULT,
+ IOASID_SET_TYPE_NULL);
+   if (IS_ERR_OR_NULL(host_pasid_set)) {
+   pr_err("Failed to allocate host PASID set %lu\n",
+   PTR_ERR(host_pasid_set));
+   intel_iommu_sm = 0;
+   }
+   }
 
/*
 * for each drhd
@@ -3381,7 +3393,7 @@ static int __init init_dmars(void)
disable_dmar_iommu(iommu);
free_dmar_iommu(iommu);
}
-
+   ioasid_set_put(host_pasid_set);
kfree(g_iommus);
 
 error:
@@ -5163,7 +5175,7 @@ static void auxiliary_unlink_device(struct dmar_domain *domain,
domain->auxd_refcnt--;
 
if (!domain->auxd_refcnt && domain->default_pasid > 0)
-   ioasid_free(domain->default_pasid);
+   ioasid_free(host_pasid_set, domain->default_pasid);
 }
 
 static int aux_domain_add_dev(struct dmar_domain *domain,
@@ -5181,7 +5193,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain,
int pasid;
 
/* No private data needed for the default pasid */
-   pasid = ioasid_alloc(NULL, PASID_MIN,
+   pasid = ioasid_alloc(host_pasid_set, PASID_MIN,
 pci_max_pasids(to_pci_dev(dev)) - 1,
 NULL);
if (pasid == INVALID_IOASID) {
@@ -5224,7 +5236,7 @@ static int aux_domain_add_dev(struct dmar_domain *domain,
spin_unlock(&iommu->lock);
spin_unlock_irqrestore(&device_domain_lock, flags);
if (!domain->auxd_refcnt && domain->default_pasid > 0)
-   ioasid_free(domain->default_pasid);
+   ioasid_free(host_pasid_set, domain->default_pasid);
 
return ret;
 }
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index c9850766c3a9..ccdc23446015 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -99,6 +99,7 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte)
 }
 
 extern u32 intel_pasid_max_id;
+extern struct ioasid_set *host_pasid_set;
 int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp);
 void intel_pasid_free_id(int pasid);
 void *intel_pasid_lookup_id(int pasid);
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 06a16bee7b65..2e764e283469 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c

[PATCH v3 13/14] iommu/vt-d: Listen to IOASID notifications

2020-09-28 Thread Jacob Pan
On Intel Scalable I/O Virtualization (SIOV) enabled platforms, the IOMMU
driver is one of the users of IOASIDs. In the normal flow, callers
perform IOASID allocation, bind, unbind, and free in order. However, for
guest SVA, an IOASID free can arrive before unbind because the guest is
untrusted. This patch registers an IOASID notification handler so that
the IOMMU driver can perform PASID teardown upon receiving an unexpected
IOASID free event.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/svm.c   | 94 -
 include/linux/intel-iommu.h |  2 +
 2 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 39a09a93300e..8f886718df83 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -95,6 +95,90 @@ static inline bool intel_svm_capable(struct intel_iommu *iommu)
return iommu->flags & VTD_FLAG_SVM_CAPABLE;
 }
 
+static inline void intel_svm_drop_pasid(ioasid_t pasid)
+{
+   ioasid_detach_data(pasid);
+   ioasid_put(NULL, pasid);
+}
+
+static DEFINE_MUTEX(pasid_mutex);
+#define pasid_lock_held() lock_is_held(&pasid_mutex.dep_map)
+
+static void intel_svm_free_async_fn(struct work_struct *work)
+{
+   struct intel_svm *svm = container_of(work, struct intel_svm, work);
+   struct intel_svm_dev *sdev;
+
+   /*
+    * Unbind all devices associated with this PASID, which is
+    * being freed by another user such as VFIO.
+    */
+   mutex_lock(&pasid_mutex);
+   list_for_each_entry_rcu(sdev, &svm->devs, list, pasid_lock_held()) {
+   /* Does not poison forward pointer */
+   list_del_rcu(&sdev->list);
+   spin_lock(&svm->iommu->lock);
+   intel_pasid_tear_down_entry(svm->iommu, sdev->dev,
+   svm->pasid, true);
+   spin_unlock(&svm->iommu->lock);
+   kfree_rcu(sdev, rcu);
+   }
+   /*
+    * We may not be the last user to drop the reference, but since
+    * the PASID is in FREE_PENDING state, no new references can be
+    * taken. Therefore, we can safely free the private data svm.
+    */
+   intel_svm_drop_pasid(svm->pasid);
+   /*
+* Free before unbind can only happen with host PASIDs used for
+* guest SVM. We get here because ioasid_free is called with
+* outstanding references. So we need to drop the reference
+* such that the PASID can be reclaimed. unbind_gpasid() after this
+* will not result in dropping refcount since the private data is
+* already detached.
+*/
+   kfree(svm);
+
+   mutex_unlock(&pasid_mutex);
+}
+
+
+static int pasid_status_change(struct notifier_block *nb,
+   unsigned long code, void *data)
+{
+   struct ioasid_nb_args *args = (struct ioasid_nb_args *)data;
+   struct intel_svm *svm = (struct intel_svm *)args->pdata;
+   int ret = NOTIFY_DONE;
+
+   if (code == IOASID_NOTIFY_FREE) {
+   /*
+* If PASID UNBIND happens before FREE, private data of the
+* IOASID should be NULL, then we don't need to do anything.
+*/
+   if (!svm)
+   goto done;
+   if (args->id != svm->pasid) {
+   pr_warn("Notify PASID does not match data %d : %d\n",
+   args->id, svm->pasid);
+   goto done;
+   }
+   schedule_work(&svm->work);
+   return NOTIFY_OK;
+   }
+done:
+   return ret;
+}
+
+static struct notifier_block pasid_nb = {
+   .notifier_call = pasid_status_change,
+};
+
+void intel_svm_add_pasid_notifier(void)
+{
+   /* Listen to all PASIDs, not specific to a set */
+   ioasid_register_notifier(NULL, &pasid_nb);
+}
+
 void intel_svm_check(struct intel_iommu *iommu)
 {
if (!pasid_supported(iommu))
@@ -221,7 +305,6 @@ static const struct mmu_notifier_ops intel_mmuops = {
.invalidate_range = intel_invalidate_range,
 };
 
-static DEFINE_MUTEX(pasid_mutex);
 static LIST_HEAD(global_svm_list);
 
 #define for_each_svm_dev(sdev, svm, d) \
@@ -344,6 +427,13 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
svm->flags |= SVM_FLAG_GUEST_PASID;
}
ioasid_attach_data(data->hpasid, svm);
+   ioasid_get(NULL, svm->pasid);
+   svm->iommu = iommu;
+   /*
+    * Set up async cleanup work in case the IOASID core notifies us
+    * that the PASID is freed before unbind.
+    */
+   INIT_WORK(&svm->work, intel_svm_free_async_fn);
+   INIT_LIST_HEAD_RCU(&svm->devs);
}
sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
@@ -437,7 +527,7 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid)

[PATCH v3 02/14] iommu/ioasid: Rename ioasid_set_data()

2020-09-28 Thread Jacob Pan
Rename ioasid_set_data() to ioasid_attach_data() to avoid confusion with
struct ioasid_set. ioasid_set is a group of IOASIDs that share a common
token.
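
A minimal usage sketch of the renamed API (my_state is a hypothetical
driver pointer):

	/* associate driver state with an allocated IOASID */
	ioasid_attach_data(pasid, my_state);	/* was: ioasid_set_data() */

	/* look it up again later */
	my_state = ioasid_find(set, pasid, NULL);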

Reviewed-by: Jean-Philippe Brucker 
Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/svm.c | 6 +++---
 drivers/iommu/ioasid.c| 6 +++---
 include/linux/ioasid.h| 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 0cb9a15f1112..2c5645f0737a 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -346,7 +346,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
svm->gpasid = data->gpasid;
svm->flags |= SVM_FLAG_GUEST_PASID;
}
-   ioasid_set_data(data->hpasid, svm);
+   ioasid_attach_data(data->hpasid, svm);
INIT_LIST_HEAD_RCU(&svm->devs);
mmput(svm->mm);
}
@@ -398,7 +398,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
list_add_rcu(&sdev->list, &svm->devs);
  out:
if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
-   ioasid_set_data(data->hpasid, NULL);
+   ioasid_attach_data(data->hpasid, NULL);
kfree(svm);
}
 
@@ -441,7 +441,7 @@ int intel_svm_unbind_gpasid(struct device *dev, int pasid)
 * the unbind, IOMMU driver will get notified
 * and perform cleanup.
 */
-   ioasid_set_data(pasid, NULL);
+   ioasid_attach_data(pasid, NULL);
kfree(svm);
}
}
diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 0f8dd377aada..5f63af07acd5 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -258,14 +258,14 @@ void ioasid_unregister_allocator(struct ioasid_allocator_ops *ops)
 EXPORT_SYMBOL_GPL(ioasid_unregister_allocator);
 
 /**
- * ioasid_set_data - Set private data for an allocated ioasid
+ * ioasid_attach_data - Set private data for an allocated ioasid
  * @ioasid: the ID to set data
  * @data:   the private data
  *
  * For an IOASID that is already allocated, private data can be attached
  * via this API. Future lookups can be done via ioasid_find().
  */
-int ioasid_set_data(ioasid_t ioasid, void *data)
+int ioasid_attach_data(ioasid_t ioasid, void *data)
 {
struct ioasid_data *ioasid_data;
int ret = 0;
@@ -287,7 +287,7 @@ int ioasid_set_data(ioasid_t ioasid, void *data)
 
return ret;
 }
-EXPORT_SYMBOL_GPL(ioasid_set_data);
+EXPORT_SYMBOL_GPL(ioasid_attach_data);
 
 /**
  * ioasid_alloc - Allocate an IOASID
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index 6f000d7a0ddc..9c44947a68c8 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -39,7 +39,7 @@ void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
  bool (*getter)(void *));
 int ioasid_register_allocator(struct ioasid_allocator_ops *allocator);
 void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator);
-int ioasid_set_data(ioasid_t ioasid, void *data);
+int ioasid_attach_data(ioasid_t ioasid, void *data);
 
 #else /* !CONFIG_IOASID */
 static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min,
@@ -67,7 +67,7 @@ static inline void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator)
 {
 }
 
-static inline int ioasid_set_data(ioasid_t ioasid, void *data)
+static inline int ioasid_attach_data(ioasid_t ioasid, void *data)
 {
return -ENOTSUPP;
 }
-- 
2.7.4



[PATCH v3 04/14] iommu/ioasid: Support setting system-wide capacity

2020-09-28 Thread Jacob Pan
IOASID is a system-wide resource that can vary across systems. The
default capacity is 20 bits, as defined in the PCIe specification. This
patch adds a function to allow adjusting the system IOASID capacity. For
VT-d this is set during boot as part of the Intel IOMMU initialization.
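
A usage sketch under these assumptions; the capacity is installed once
at boot and read back by consumers sizing per-set quotas (vm_quota is
hypothetical):

	/* boot-time, before any IOASID is allocated */
	ioasid_install_capacity(intel_pasid_max_id);

	/* a consumer clamping a per-VM quota to the system total */
	if (vm_quota > ioasid_get_capacity())
		vm_quota = ioasid_get_capacity();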

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel/iommu.c |  5 +
 drivers/iommu/ioasid.c  | 20 
 include/linux/ioasid.h  | 11 +++
 3 files changed, 36 insertions(+)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 18ed3b3c70d7..e7bcb299e51e 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -3331,6 +3332,10 @@ static int __init init_dmars(void)
if (ret)
goto free_iommu;
 
+   /* PASID is needed for scalable mode irrespective of SVM */
+   if (intel_iommu_sm)
+   ioasid_install_capacity(intel_pasid_max_id);
+
/*
 * for each drhd
 *   enable fault log
diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
index 6cfbdfb492e0..4277cb17e15b 100644
--- a/drivers/iommu/ioasid.c
+++ b/drivers/iommu/ioasid.c
@@ -10,6 +10,10 @@
 #include 
 #include 
 
+/* Default to the PCIe standard 20-bit PASID */
+#define PCI_PASID_MAX 0x100000
+static ioasid_t ioasid_capacity = PCI_PASID_MAX;
+static ioasid_t ioasid_capacity_avail = PCI_PASID_MAX;
 struct ioasid_data {
ioasid_t id;
struct ioasid_set *set;
@@ -17,6 +21,22 @@ struct ioasid_data {
struct rcu_head rcu;
 };
 
+void ioasid_install_capacity(ioasid_t total)
+{
+   if (ioasid_capacity && ioasid_capacity != PCI_PASID_MAX) {
+   pr_warn("IOASID capacity is already set.\n");
+   return;
+   }
+   ioasid_capacity = ioasid_capacity_avail = total;
+}
+EXPORT_SYMBOL_GPL(ioasid_install_capacity);
+
+ioasid_t ioasid_get_capacity(void)
+{
+   return ioasid_capacity;
+}
+EXPORT_SYMBOL_GPL(ioasid_get_capacity);
+
 /*
  * struct ioasid_allocator_data - Internal data structure to hold information
  * about an allocator. There are two types of allocators:
diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
index c7f649fa970a..7fc320656be2 100644
--- a/include/linux/ioasid.h
+++ b/include/linux/ioasid.h
@@ -32,6 +32,8 @@ struct ioasid_allocator_ops {
 #define DECLARE_IOASID_SET(name) struct ioasid_set name = { 0 }
 
 #if IS_ENABLED(CONFIG_IOASID)
+void ioasid_install_capacity(ioasid_t total);
+ioasid_t ioasid_get_capacity(void);
 ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
  void *private);
 void ioasid_free(ioasid_t ioasid);
@@ -42,6 +44,15 @@ void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator);
 int ioasid_attach_data(ioasid_t ioasid, void *data);
 void ioasid_detach_data(ioasid_t ioasid);
 #else /* !CONFIG_IOASID */
+static inline void ioasid_install_capacity(ioasid_t total)
+{
+}
+
+static inline ioasid_t ioasid_get_capacity(void)
+{
+   return 0;
+}
+
 static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min,
ioasid_t max, void *private)
 {
-- 
2.7.4


