[PATCH] intel-iommu: Remove all IOVA handling code from the non-dma_ops path in the intel iommu driver

2020-04-01 Thread Tom Murphy
There's no need for the non-dma_ops path to keep track of IOVAs. The
whole point of the non-dma_ops path is that it allows the IOVAs to be
handled separately. The IOVA handling code removed in this patch is
pointless.

Signed-off-by: Tom Murphy 
---
 drivers/iommu/intel-iommu.c | 97 +
 1 file changed, 33 insertions(+), 64 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 4be549478691..b92606979914 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1911,11 +1911,6 @@ static int dmar_init_reserved_ranges(void)
return 0;
 }
 
-static void domain_reserve_special_ranges(struct dmar_domain *domain)
-{
-   copy_reserved_iova(&reserved_iova_list, &domain->iovad);
-}
-
 static inline int guestwidth_to_adjustwidth(int gaw)
 {
int agaw;
@@ -1946,7 +1941,7 @@ static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
pr_info("iova flush queue initialization failed\n");
}
 
-   domain_reserve_special_ranges(domain);
+   copy_reserved_iova(&reserved_iova_list, &domain->iovad);
 
/* calculate AGAW */
if (guest_width > cap_mgaw(iommu->cap))
@@ -1996,7 +1991,8 @@ static void domain_exit(struct dmar_domain *domain)
domain_remove_dev_info(domain);
 
/* destroy iovas */
-   put_iova_domain(&domain->iovad);
+   if (domain->domain.type == IOMMU_DOMAIN_DMA)
+   put_iova_domain(&domain->iovad);
 
if (domain->pgd) {
struct page *freelist;
@@ -2793,19 +2789,9 @@ static struct dmar_domain *set_domain_for_dev(struct device *dev,
 }
 
 static int iommu_domain_identity_map(struct dmar_domain *domain,
-unsigned long long start,
-unsigned long long end)
+unsigned long first_vpfn,
+unsigned long last_vpfn)
 {
-   unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
-   unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
-
-   if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
- dma_to_mm_pfn(last_vpfn))) {
-   pr_err("Reserving iova failed\n");
-   return -ENOMEM;
-   }
-
-   pr_debug("Mapping reserved region %llx-%llx\n", start, end);
/*
 * RMRR range might have overlap with physical memory range,
 * clear it first
@@ -2882,7 +2868,8 @@ static int __init si_domain_init(int hw)
 
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
ret = iommu_domain_identity_map(si_domain,
-   PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
+   mm_to_dma_pfn(start_pfn),
+   mm_to_dma_pfn(end_pfn));
if (ret)
return ret;
}
@@ -4812,58 +4799,37 @@ static int intel_iommu_memory_notifier(struct notifier_block *nb,
   unsigned long val, void *v)
 {
struct memory_notify *mhp = v;
-   unsigned long long start, end;
-   unsigned long start_vpfn, last_vpfn;
+   unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
+   unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
+   mhp->nr_pages - 1);
 
switch (val) {
case MEM_GOING_ONLINE:
-   start = mhp->start_pfn << PAGE_SHIFT;
-   end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
-   if (iommu_domain_identity_map(si_domain, start, end)) {
-   pr_warn("Failed to build identity map for 
[%llx-%llx]\n",
-   start, end);
+   if (iommu_domain_identity_map(si_domain, start_vpfn,
+   last_vpfn)) {
+   pr_warn("Failed to build identity map for [%lx-%lx]\n",
+   start_vpfn, last_vpfn);
return NOTIFY_BAD;
}
break;
 
case MEM_OFFLINE:
case MEM_CANCEL_ONLINE:
-   start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
-   last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
-   while (start_vpfn <= last_vpfn) {
-   struct iova *iova;
+   {
struct dmar_drhd_unit *drhd;
struct intel_iommu *iommu;
struct page *freelist;
 
-   iova = find_iova(&si_domain->iovad, start_vpfn);
-   if (iova == NULL) {
-   pr_debug("Failed get IOVA for PFN %lx\n",
-start_vpfn);
-   break;
-   }
-
-   iova = 

RE: [EXT] Re: [RFC PATCH v2] iommu/virtio: Use page size bitmap supported by endpoint

2020-04-01 Thread Bharat Bhushan



> -Original Message-
> From: Jean-Philippe Brucker 
> Sent: Wednesday, April 1, 2020 9:20 PM
> To: Robin Murphy 
> Cc: Bharat Bhushan ; j...@8bytes.org;
> m...@redhat.com; jasow...@redhat.com; virtualization@lists.linux-
> foundation.org; iommu@lists.linux-foundation.org; 
> linux-ker...@vger.kernel.org;
> eric.auger@gmail.com; eric.au...@redhat.com
> Subject: [EXT] Re: [RFC PATCH v2] iommu/virtio: Use page size bitmap 
> supported by
> endpoint
> 
> On Wed, Apr 01, 2020 at 02:00:13PM +0100, Robin Murphy wrote:
> > On 2020-04-01 12:38 pm, Bharat Bhushan wrote:
> > > Different endpoints can support different page sizes. Probe the endpoint
> > > to see if it supports a specific page size, otherwise use the global page sizes.
> > >
> > > Signed-off-by: Bharat Bhushan 
> > > ---
> > >   drivers/iommu/virtio-iommu.c  | 33 +++
> > >   include/uapi/linux/virtio_iommu.h |  7 +++
> > >   2 files changed, 36 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/drivers/iommu/virtio-iommu.c
> > > b/drivers/iommu/virtio-iommu.c index cce329d71fba..c794cb5b7b3e
> > > 100644
> > > --- a/drivers/iommu/virtio-iommu.c
> > > +++ b/drivers/iommu/virtio-iommu.c
> > > @@ -78,6 +78,7 @@ struct viommu_endpoint {
> > >   struct viommu_dev   *viommu;
> > >   struct viommu_domain*vdomain;
> > >   struct list_headresv_regions;
> > > + u64 pgsize_bitmap;
> > >   };
> > >   struct viommu_request {
> > > @@ -415,6 +416,20 @@ static int viommu_replay_mappings(struct
> viommu_domain *vdomain)
> > >   return ret;
> > >   }
> > > +static int viommu_set_pgsize_bitmap(struct viommu_endpoint *vdev,
> > > + struct virtio_iommu_probe_pgsize_mask *mask,
> > > + size_t len)
> > > +
> > > +{
> > > + u64 pgsize_bitmap = le64_to_cpu(mask->pgsize_bitmap);
> > > +
> > > + if (len < sizeof(*mask))
> > > + return -EINVAL;
> > > +
> > > + vdev->pgsize_bitmap = pgsize_bitmap;
> > > + return 0;
> > > +}
> > > +
> > >   static int viommu_add_resv_mem(struct viommu_endpoint *vdev,
> > >  struct virtio_iommu_probe_resv_mem *mem,
> > >  size_t len)
> > > @@ -494,11 +509,13 @@ static int viommu_probe_endpoint(struct
> viommu_dev *viommu, struct device *dev)
> > >   while (type != VIRTIO_IOMMU_PROBE_T_NONE &&
> > >  cur < viommu->probe_size) {
> > >   len = le16_to_cpu(prop->length) + sizeof(*prop);
> > > -
> 
> Whitespace change
> 
> > >   switch (type) {
> > >   case VIRTIO_IOMMU_PROBE_T_RESV_MEM:
> > >   ret = viommu_add_resv_mem(vdev, (void *)prop, 
> > > len);
> > >   break;
> > > + case VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK:
> > > + ret = viommu_set_pgsize_bitmap(vdev, (void *)prop, len);
> > > + break;
> > >   default:
> > >   dev_err(dev, "unknown viommu prop 0x%x\n", 
> > > type);
> > >   }
> > > @@ -607,16 +624,23 @@ static struct iommu_domain
> *viommu_domain_alloc(unsigned type)
> > >   return &vdomain->domain;
> > >   }
> > > -static int viommu_domain_finalise(struct viommu_dev *viommu,
> > > +static int viommu_domain_finalise(struct viommu_endpoint *vdev,
> > > struct iommu_domain *domain)
> > >   {
> > >   int ret;
> > >   struct viommu_domain *vdomain = to_viommu_domain(domain);
> > > + struct viommu_dev *viommu = vdev->viommu;
> > >   vdomain->viommu = viommu;
> > >   vdomain->map_flags  = viommu->map_flags;
> > > - domain->pgsize_bitmap   = viommu->pgsize_bitmap;
> > > + /* Devices in same domain must support same size pages */
> >
> > AFAICS what the code appears to do is enforce that the first endpoint
> > attached to any domain has the same pgsize_bitmap as the most recently
> > probed viommu_dev instance, then ignore any subsequent endpoints
> > attached to the same domain. Thus I'm not sure that comment is accurate.
> >
> 
> Yes viommu_domain_finalise() is only called once. What I had in mind is
> something like:

Ohh, I had not realized that viommu_domain_finalise() is only called once.
I have included the suggested changes below.

> 
>  8< 
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index
> 750f69c49b95..8303b7b513ff 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -639,6 +639,29 @@ static int viommu_domain_finalise(struct
> viommu_endpoint *vdev,
>   return 0;
>  }
> 
> +/*
> + * Check whether the endpoint's capabilities are compatible with other
> +endpoints
> + * in the domain. Report any inconsistency.
> + */
> +static bool 

RE: [PATCH V10 11/11] iommu/vt-d: Add custom allocator for IOASID

2020-04-01 Thread Tian, Kevin
> From: Jacob Pan 
> Sent: Wednesday, April 1, 2020 11:48 PM
> 
> On Sat, 28 Mar 2020 10:22:41 +
> "Tian, Kevin"  wrote:
> 
> > > From: Jacob Pan 
> > > Sent: Saturday, March 21, 2020 7:28 AM
> > >
> > > When VT-d driver runs in the guest, PASID allocation must be
> > > performed via virtual command interface. This patch registers a
> > > custom IOASID allocator which takes precedence over the default
> > > XArray based allocator. The resulting IOASID allocation will always
> > > come from the host. This ensures that PASID namespace is system-
> > > wide.
> > >
> > > Signed-off-by: Lu Baolu 
> > > Signed-off-by: Liu, Yi L 
> > > Signed-off-by: Jacob Pan 
> > > ---
> > >  drivers/iommu/intel-iommu.c | 84
> > > +
> > >  include/linux/intel-iommu.h |  2 ++
> > >  2 files changed, 86 insertions(+)
> > >
> > > diff --git a/drivers/iommu/intel-iommu.c
> > > b/drivers/iommu/intel-iommu.c index a76afb0fd51a..c1c0b0fb93c3
> > > 100644 --- a/drivers/iommu/intel-iommu.c
> > > +++ b/drivers/iommu/intel-iommu.c
> > > @@ -1757,6 +1757,9 @@ static void free_dmar_iommu(struct
> intel_iommu
> > > *iommu)
> > >   if (ecap_prs(iommu->ecap))
> > >   intel_svm_finish_prq(iommu);
> > >   }
> > > + if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
> > > + ioasid_unregister_allocator(&iommu->pasid_allocator);
> > > +
> > >  #endif
> > >  }
> > >
> > > @@ -3291,6 +3294,84 @@ static int copy_translation_tables(struct
> > > intel_iommu *iommu)
> > >   return ret;
> > >  }
> > >
> > > +#ifdef CONFIG_INTEL_IOMMU_SVM
> > > +static ioasid_t intel_ioasid_alloc(ioasid_t min, ioasid_t max,
> > > void *data)
> >
> > the name is too generic... can we add vcmd in the name to clarify
> > its purpose, e.g. intel_vcmd_ioasid_alloc?
> >
> I feel the intel_ prefix is a natural extension of a generic API, we do
> that for other IOMMU APIs, right?

other IOMMU APIs make no distinction between host and guest, but
this one only applies to a guest with the vcmd interface. 

> 
> > > +{
> > > + struct intel_iommu *iommu = data;
> > > + ioasid_t ioasid;
> > > +
> > > + if (!iommu)
> > > + return INVALID_IOASID;
> > > + /*
> > > +  * VT-d virtual command interface always uses the full 20
> > > bit
> > > +  * PASID range. Host can partition guest PASID range based
> > > on
> > > +  * policies but it is out of guest's control.
> > > +  */
> > > + if (min < PASID_MIN || max > intel_pasid_max_id)
> > > + return INVALID_IOASID;
> > > +
> > > + if (vcmd_alloc_pasid(iommu, &ioasid))
> > > + return INVALID_IOASID;
> > > +
> > > + return ioasid;
> > > +}
> > > +
> > > +static void intel_ioasid_free(ioasid_t ioasid, void *data)
> > > +{
> > > + struct intel_iommu *iommu = data;
> > > +
> > > + if (!iommu)
> > > + return;
> > > + /*
> > > +  * Sanity check the ioasid owner is done at upper layer,
> > > e.g. VFIO
> > > +  * We can only free the PASID when all the devices are
> > > unbound.
> > > +  */
> > > + if (ioasid_find(NULL, ioasid, NULL)) {
> > > + pr_alert("Cannot free active IOASID %d\n", ioasid);
> > > + return;
> > > + }
> >
> > However the sanity check is not done in default_free. Is there a
> > reason why using vcmd adds such  new requirement?
> >
> Since we don't support nested guests, this vcmd allocator is only used
> by the guest IOMMU driver, not VFIO. We expect the IOMMU driver to have
> control of the free()/unbind() ordering.
> 
> For default_free, it can come from user space and host VFIO which can
> be out of order. But we will solve that issue with the blocking
> notifier.
> 
> > > + vcmd_free_pasid(iommu, ioasid);
> > > +}
> > > +
> > > +static void register_pasid_allocator(struct intel_iommu *iommu)
> > > +{
> > > + /*
> > > +  * If we are running in the host, no need for custom
> > > allocator
> > > +  * in that PASIDs are allocated from the host system-wide.
> > > +  */
> > > + if (!cap_caching_mode(iommu->cap))
> > > + return;
> >
> > is it more accurate to check against vcmd capability?
> >
> I think this is sufficient. The spec says if vcmd is present, we must
> use it but not the other way.

No. What about a vIOMMU implementation that reports CM but not
VCMD? I don't see the rationale for checking an indirect capability
when there is already one well defined for the purpose.

> 
> > > +
> > > + if (!sm_supported(iommu)) {
> > > + pr_warn("VT-d Scalable Mode not enabled, no PASID
> > > allocation\n");
> > > + return;
> > > + }
> > > +
> > > + /*
> > > +  * Register a custom PASID allocator if we are running in
> > > a guest,
> > > +  * guest PASID must be obtained via virtual command
> > > interface.
> > > +  * There can be multiple vIOMMUs in each guest but only one
> > > allocator
> > > +  * is active. All vIOMMU allocators will eventually be
> > > calling the same
> >
> > which one? the first or last?
> >
> All allocators share the same ops, so first=last. IOASID code will
> inspect the ops function and see if they 

RE: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host

2020-04-01 Thread Tian, Kevin
> From: Liu, Yi L 
> Sent: Wednesday, April 1, 2020 5:13 PM
> 
> > From: Tian, Kevin 
> > Sent: Monday, March 30, 2020 8:46 PM
> > Subject: RE: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host
> >
> > > From: Liu, Yi L 
> > > Sent: Sunday, March 22, 2020 8:32 PM
> > >
> > > From: Liu Yi L 
> > >
> > > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by
> hardware
> > > IOMMUs that have nesting DMA translation (a.k.a dual stage address
> > > translation). For such hardware IOMMUs, there are two stages/levels of
> > > address translation, and software may let userspace/VM to own the
> > > first-
> > > level/stage-1 translation structures. Example of such usage is vSVA (
> > > virtual Shared Virtual Addressing). VM owns the first-level/stage-1
> > > translation structures and binds the structures to host, then hardware
> > > IOMMU would utilize nesting translation when doing DMA translation for
> > > the devices behind such hardware IOMMU.
> > >
> > > This patch adds vfio support for binding guest translation (a.k.a
> > > stage 1) structure to host iommu. And for VFIO_TYPE1_NESTING_IOMMU,
> > > not only bind guest page table is needed, it also requires to expose
> > > interface to guest for iommu cache invalidation when guest modified
> > > the first-level/stage-1 translation structures since hardware needs to
> > > be notified to flush stale iotlbs. This would be introduced in next
> > > patch.
> > >
> > > In this patch, guest page table bind and unbind are done by using
> > > flags VFIO_IOMMU_BIND_GUEST_PGTBL and
> > VFIO_IOMMU_UNBIND_GUEST_PGTBL
> > > under IOCTL VFIO_IOMMU_BIND, the bind/unbind data are conveyed by
> > > struct iommu_gpasid_bind_data. Before binding guest page table to
> > > host, VM should have got a PASID allocated by host via
> > > VFIO_IOMMU_PASID_REQUEST.
> > >
> > > Bind guest translation structures (here is guest page table) to host
> >
> > Bind -> Binding
> got it.
> > > are the first step to setup vSVA (Virtual Shared Virtual Addressing).
> >
> > are -> is. and you already explained vSVA earlier.
> oh yes, it is.
> > >
> > > Cc: Kevin Tian 
> > > CC: Jacob Pan 
> > > Cc: Alex Williamson 
> > > Cc: Eric Auger 
> > > Cc: Jean-Philippe Brucker 
> > > Signed-off-by: Jean-Philippe Brucker 
> > > Signed-off-by: Liu Yi L 
> > > Signed-off-by: Jacob Pan 
> > > ---
> > >  drivers/vfio/vfio_iommu_type1.c | 158
> > > 
> > >  include/uapi/linux/vfio.h   |  46 
> > >  2 files changed, 204 insertions(+)
> > >
> > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > > b/drivers/vfio/vfio_iommu_type1.c index 82a9e0b..a877747 100644
> > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > @@ -130,6 +130,33 @@ struct vfio_regions {
> > >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)  \
> > >   (!list_empty(&iommu->domain_list))
> > >
> > > +struct domain_capsule {
> > > + struct iommu_domain *domain;
> > > + void *data;
> > > +};
> > > +
> > > +/* iommu->lock must be held */
> > > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu,
> > > +   int (*fn)(struct device *dev, void *data),
> > > +   void *data)
> > > +{
> > > + struct domain_capsule dc = {.data = data};
> > > + struct vfio_domain *d;
> > > + struct vfio_group *g;
> > > + int ret = 0;
> > > +
> > > + list_for_each_entry(d, &iommu->domain_list, next) {
> > > + dc.domain = d->domain;
> > > + list_for_each_entry(g, &d->group_list, next) {
> > > + ret = iommu_group_for_each_dev(g->iommu_group,
> > > +   &dc, fn);
> > > + if (ret)
> > > + break;
> > > + }
> > > + }
> > > + return ret;
> > > +}
> > > +
> > >  static int put_pfn(unsigned long pfn, int prot);
> > >
> > >  /*
> > > @@ -2314,6 +2341,88 @@ static int
> > > vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
> > >   return 0;
> > >  }
> > >
> > > +static int vfio_bind_gpasid_fn(struct device *dev, void *data) {
> > > + struct domain_capsule *dc = (struct domain_capsule *)data;
> > > + struct iommu_gpasid_bind_data *gbind_data =
> > > + (struct iommu_gpasid_bind_data *) dc->data;
> > > +
> >
> > In Jacob's vSVA iommu series, [PATCH 06/11]:
> >
> > +   /* REVISIT: upper layer/VFIO can track host process that bind
> the
> > PASID.
> > +* ioasid_set = mm might be sufficient for vfio to check pasid
> VMM
> > +* ownership.
> > +*/
> >
> > I asked him who exactly should be responsible for tracking the pasid
> ownership.
> > Although no response yet, I expect vfio/iommu can have a clear policy and
> also
> > documented here to provide consistent message.
> 
> yep.
> 
> > > + return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data); }
> > > +
> > > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data) {
> > > + struct domain_capsule *dc = 

Re: [PATCH 00/10] IOASID extensions for guest SVA

2020-04-01 Thread Jacob Pan
On Wed, 1 Apr 2020 16:03:01 +0200
Jean-Philippe Brucker  wrote:

> Hi Jacob,
> 
> On Wed, Mar 25, 2020 at 10:55:21AM -0700, Jacob Pan wrote:
> > IOASID was introduced in v5.5 as a generic kernel allocator service
> > for both PCIe Process Address Space ID (PASID) and ARM SMMU's Sub
> > Stream ID. In addition to basic ID allocation, ioasid_set was
> > introduced as a token that is shared by a group of IOASIDs. This
> > set token can be used for permission checking but lack of some
> > features needed by guest Shared Virtual Address (SVA). In addition,
> > IOASID support for life cycle management is needed among multiple
> > users.
> > 
> > This patchset introduces two extensions to the IOASID code,
> > 1. IOASID set operations
> > 2. Notifications for IOASID state synchronization  
> 
> My main concern with this series is patch 7 changing the spinlock to a
> mutex, which prevents SVA from calling ioasid_free() from the RCU
> callback of MMU notifiers. Could we use atomic notifiers, or do the
> FREE notification another way?
> 
Maybe I am looking at the wrong code; I thought
mmu_notifier_ops.free_notifier() is called outside the spinlock with
call_srcu(), which will be invoked in thread context.
In mmu_notifier.c, mmu_notifier_put():

	spin_unlock(&mm->notifier_subscriptions->lock);
	call_srcu(&srcu, &subscription->rcu, mmu_notifier_free_rcu);

Anyway, if we have to use atomic notifiers: I tried an atomic notifier
first; there are two subscribers to the free event on x86.
1. IOMMU
2. KVM

For #1, the problem is that in the free operation, the VT-d driver
needs to do a lot of cleanup in thread context.
- hold a mutex to traverse a list of devices
- clear PASID entry and flush cache

For #2, KVM might be able to deal with spinlocks for updating VMCS
PASID translation table. +Hao

Perhaps two solutions I can think of:
1. Use a cyclic IOASID allocator. The main reason for cleaning up at free
time is to prevent a race with IOASID alloc. Similar to PIDs, 2M IOASIDs
will take a long time to overflow. Then we can use an atomic notifier and
a deferred workqueue to do the IOMMU cleanup. The downside is a large and
growing PASID table, though that may not be a performance issue since it
has a TLB.

2. Let VFIO ensure free always happens after unbind. Then there is no
need to do cleanup. But that requires VFIO to keep track of all the
PASIDs within each VM. When the VM terminates, VFIO is responsible for
the cleanup. That was Yi's original proposal. I also tried to provide
an IOASID set iterator for VFIO to free the IOASIDs within each VM/set,
but the private data belongs to the IOMMU driver.
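
A minimal sketch of option 1, assuming an XArray-backed allocator (the
names are illustrative, not the actual ioasid.c code); xa_alloc_cyclic()
resumes searching after the last allocated ID, so a freed IOASID is not
handed out again until the 20-bit space wraps, which shrinks the window
for the deferred-cleanup race:

#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC(ioasid_xa);
static u32 ioasid_next_id;

static int ioasid_alloc_cyclic_sketch(void *private, u32 *ioasid)
{
	u32 id;
	int ret;

	/* The search starts after the most recently allocated ID and
	 * only wraps once the 20-bit PASID space is exhausted. */
	ret = xa_alloc_cyclic(&ioasid_xa, &id, private,
			      XA_LIMIT(1, (1U << 20) - 1),
			      &ioasid_next_id, GFP_KERNEL);
	if (ret < 0)
		return ret;
	/* ret == 1 only means the counter wrapped; the alloc succeeded. */
	*ioasid = id;
	return 0;
}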

Thanks,

Jacob

> Most of my other comments are just confusion on my part, I think, as I
> haven't yet properly looked through the VFIO and VT-d changes. I'd
> rather avoid the change to ioasid_find() if possible, because it adds
> a seemingly unnecessary indirection to the fast path, but it's
> probably insignificant.
> 
> Thanks,
> Jean
> 
> > 
> > Part #1:
> > IOASIDs used by each VM fits naturally into an ioasid_set. The usage
> > for per set management requires the following features:
> > 
> > - Quota enforcement - This is to prevent one VM from abusing the
> >   allocator to take all the system IOASIDs. Though the VFIO layer can
> > also enforce the quota, it cannot cover the usage with both
> > guest and host SVA on the same system.
> > 
> > - Stores guest IOASID-Host IOASID mapping within the set. To
> >   support live migration, IOASID namespace should be owned by the
> >   guest. This requires per IOASID set look up between guest and host
> >   IOASIDs. This patchset does not introduce non-identity guest-host
> >   IOASID lookup, we merely introduce the infrastructure in per set
> > data.
> > 
> > - Set level operations, e.g. when a guest terminates, it is likely
> > to free the entire set. Having a single place to manage the set
> > where the IOASIDs are stored makes iteration much easier.
> > 
> > 
> > New APIs are:
> > - void ioasid_install_capacity(ioasid_t total);
> >   Set the system capacity prior to any allocations. On x86, VT-d
> > driver calls this function to set max number of PASIDs, typically 1
> > million (20 bits).
> > 
> > - int ioasid_alloc_system_set(int quota);
> >   Host system has a default ioasid_set, during boot it is expected
> > that this default set is allocated with a reasonable quota, e.g.
> > PID_MAX. This default/system set is used for baremetal SVA.
> > 
> > - int ioasid_alloc_set(struct ioasid_set *token, ioasid_t quota, int
> > *sid);
> >   Allocate a new set with a token, returned sid (set ID) will be
> > used to allocate IOASIDs within the set. Allocation of IOASIDs
> > cannot exceed the quota.
> > 
> > - void ioasid_free_set(int sid, bool destroy_set);
> >   Free the entire set and notify all users with an option to destroy
> >   the set. Set ID can be used for allocation again if not destroyed.
> > 
> > - int ioasid_find_sid(ioasid_t ioasid);
> >   Look up the set ID from an ioasid. There is no reference held,
> >   assuming set has a single owner.
> > 
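
For illustration, a hypothetical flow using the set APIs quoted above
(the token, the quota of 1024, and the wrapper function names are made
up for the example):

/* Boot time, e.g. from IOMMU driver init: publish system capacity and
 * give the default/system set a reasonable quota. */
static void __init ioasid_boot_setup_sketch(void)
{
	ioasid_install_capacity(1 << 20);	  /* 20-bit PASID space */
	ioasid_alloc_system_set(PID_MAX_DEFAULT); /* bare-metal SVA set */
}

/* Per VM: carve out a set with its own quota... */
static int vm_ioasid_setup_sketch(struct ioasid_set *vm_token, int *sid)
{
	/* Allocations within the set fail once the quota is exceeded. */
	return ioasid_alloc_set(vm_token, 1024, sid);
}

/* ...and free the whole set when the VM terminates. */
static void vm_ioasid_teardown_sketch(int sid)
{
	ioasid_free_set(sid, true);
}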

Re: [PATCH] iommu/vt-d: add NUMA awareness to intel_alloc_coherent()

2020-04-01 Thread Eric Dumazet



On 2/2/18 10:59 AM, Eric Dumazet wrote:
> On Fri, Feb 2, 2018 at 10:53 AM, Christoph Hellwig  wrote:
>> I've got patches pending to replace all that code with
>> dma_direct_alloc, which will do the right thing.  They were
>> submitted for 4.16, and I will resend them after -rc1.
> 
> I see, thanks Christoph !
> 

Hi Christoph 

It seems 4.16 has shipped :-), and intel_alloc_coherent() still has no NUMA
awareness.

Should I respin https://lore.kernel.org/patchwork/patch/884326/

Thanks !


Re: [PATCH 01/10] iommu/ioasid: Introduce system-wide capacity

2020-04-01 Thread Jacob Pan
Hi Jean,

On Wed, 1 Apr 2020 15:45:52 +0200
Jean-Philippe Brucker  wrote:

> On Wed, Mar 25, 2020 at 10:55:22AM -0700, Jacob Pan wrote:
> > IOASID is a limited system-wide resource that can be allocated at
> > runtime. This limitation can be enumerated during boot. For
> > example, on x86 platforms, PCI Process Address Space ID (PASID)
> > allocation uses IOASID service. The number of supported PASID bits
> > are enumerated by extended capability register as defined in the
> > VT-d spec.
> > 
> > This patch adds a helper to set the system capacity; it is expected to
> > be set during boot prior to any allocation request.
> 
> This one-time setting is a bit awkward. Since multiple IOMMU drivers
> may be loaded, this can't be a module_init() thing. And we generally
> have multiple SMMU instances in the system. So we'd need to call
> install_capacity() only for the first SMMU loaded with an arbitrary
> 1<<20, even though each SMMU can support different numbers of PASID
> bits. Furthermore, modules such as iommu-sva will want to initialize
> their IOASID set at module_init(), which will happen before the SMMU
> can set up the capacity, so ioasid_alloc_set() will return an error.
> 
> How about hardcoding ioasid_capacity to 20 bits for now?  It's the
> PCIe limit and probably won't have to increase for a while.
> 
Sounds good. Default to 20 bits, but it can be adjusted if needed.
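
A sketch of what that default could look like (illustrative only, not
the merged code):

/* Default to the PCIe PASID limit; an IOMMU driver can still override
 * it early during boot via ioasid_install_capacity(). */
#define IOASID_CAPACITY_DEFAULT	(1U << 20)

static ioasid_t ioasid_capacity = IOASID_CAPACITY_DEFAULT;
static ioasid_t ioasid_capacity_avail = IOASID_CAPACITY_DEFAULT;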
 

> Thanks,
> Jean
> 
> > 
> > Signed-off-by: Jacob Pan 
> > ---
> >  drivers/iommu/ioasid.c | 15 +++
> >  include/linux/ioasid.h |  5 -
> >  2 files changed, 19 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
> > index 0f8dd377aada..4026e52855b9 100644
> > --- a/drivers/iommu/ioasid.c
> > +++ b/drivers/iommu/ioasid.c
> > @@ -17,6 +17,21 @@ struct ioasid_data {
> > struct rcu_head rcu;
> >  };
> >  
> > +static ioasid_t ioasid_capacity;
> > +static ioasid_t ioasid_capacity_avail;
> > +
> > +/* System capacity can only be set once */
> > +void ioasid_install_capacity(ioasid_t total)
> > +{
> > +   if (ioasid_capacity) {
> > +   pr_warn("IOASID capacity already set at %d\n",
> > ioasid_capacity);
> > +   return;
> > +   }
> > +
> > +   ioasid_capacity = ioasid_capacity_avail = total;
> > +}
> > +EXPORT_SYMBOL_GPL(ioasid_install_capacity);
> > +
> >  /*
> >   * struct ioasid_allocator_data - Internal data structure to hold
> > information
> >   * about an allocator. There are two types of allocators:
> > diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
> > index 6f000d7a0ddc..9711fa0dc357 100644
> > --- a/include/linux/ioasid.h
> > +++ b/include/linux/ioasid.h
> > @@ -40,7 +40,7 @@ void *ioasid_find(struct ioasid_set *set,
> > ioasid_t ioasid, int ioasid_register_allocator(struct
> > ioasid_allocator_ops *allocator); void
> > ioasid_unregister_allocator(struct ioasid_allocator_ops
> > *allocator); int ioasid_set_data(ioasid_t ioasid, void *data); -
> > +void ioasid_install_capacity(ioasid_t total);
> >  #else /* !CONFIG_IOASID */
> >  static inline ioasid_t ioasid_alloc(struct ioasid_set *set,
> > ioasid_t min, ioasid_t max, void *private)
> > @@ -72,5 +72,8 @@ static inline int ioasid_set_data(ioasid_t
> > ioasid, void *data) return -ENOTSUPP;
> >  }
> >  
> > +static inline void ioasid_install_capacity(ioasid_t total)
> > +{
> > +}
> >  #endif /* CONFIG_IOASID */
> >  #endif /* __LINUX_IOASID_H */
> > -- 
> > 2.7.4
> >   

[Jacob Pan]


Re: [PATCH] vfio: Ignore -ENODEV when getting MSI cookie

2020-04-01 Thread Alex Williamson
On Wed,  1 Apr 2020 11:27:24 +0100
Andre Przywara  wrote:

> When we try to get an MSI cookie for a VFIO device, that can fail if
> CONFIG_IOMMU_DMA is not set. In this case iommu_get_msi_cookie() returns
> -ENODEV, and that should not be fatal.
> 
> Ignore that case and proceed with the initialisation.
> 
> This fixes VFIO with a platform device on the Calxeda Midway (no MSIs).
> 
> Fixes: f6810c15cf973f ("iommu/arm-smmu: Clean up early-probing workarounds")
> Signed-off-by: Andre Przywara 
> Acked-by: Robin Murphy 
> Reviewed-by: Eric Auger 
> ---
>  drivers/vfio/vfio_iommu_type1.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 9fdfae1cb17a..85b32c325282 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -1787,7 +1787,7 @@ static int vfio_iommu_type1_attach_group(void 
> *iommu_data,
>  
>   if (resv_msi) {
>   ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
> - if (ret)
> + if (ret && ret != -ENODEV)
>   goto out_detach;
>   }
>  

Applied to vfio next branch for v5.7.  Thanks,

Alex



Re: [PATCH 06/10] iommu/ioasid: Convert to set aware allocations

2020-04-01 Thread Jacob Pan
On Wed, 1 Apr 2020 15:55:25 +0200
Jean-Philippe Brucker  wrote:

> On Wed, Mar 25, 2020 at 10:55:27AM -0700, Jacob Pan wrote:
> > The current ioasid_alloc function takes a token/ioasid_set then
> > records it on the IOASID being allocated. There is no alloc/free on
> > the ioasid_set.
> > 
> > With the IOASID set APIs, callers must allocate an ioasid_set before
> > allocating IOASIDs within the set. Quota and other ioasid_set level
> > activities can then be enforced.
> > 
> > This patch converts existing API to the new ioasid_set model.
> > 
> > Signed-off-by: Liu Yi L 
> > Signed-off-by: Jacob Pan   
> 
> [...]
> 
> > @@ -379,6 +391,9 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min, ioasid_t max,
> >   }
> > data->id = id;
> >  
> > +   /* Store IOASID in the per set data */
> > +   xa_store(&set->xa, id, data, GFP_KERNEL);
> 
> I couldn't figure out why you're maintaining an additional xarray for
> each set. We're already storing that data in active_allocator->xa,
> why the duplication?  If it's for the gPASID -> hPASID translation
> mentioned by the cover letter, maybe you could add this xa when
> introducing that change?
> 
Sounds good. I will add that later. I was hoping to get common code in
place.


Re: [PATCH V10 05/11] iommu/vt-d: Add nested translation helper function

2020-04-01 Thread Jacob Pan
On Sun, 29 Mar 2020 13:35:15 +0200
Auger Eric  wrote:

> Hi Jacob,
> 
> On 3/21/20 12:27 AM, Jacob Pan wrote:
> > Nested translation mode is supported in VT-d 3.0 Spec.CH 3.8.
> > With PASID granular translation type set to 0x11b, translation
> > result from the first level(FL) also subject to a second level(SL)
> > page table translation. This mode is used for SVA virtualization,
> > where FL performs guest virtual to guest physical translation and
> > SL performs guest physical to host physical translation.
> > 
> > This patch adds a helper function for setting up nested translation
> > where second level comes from a domain and first level comes from
> > a guest PGD.
> > 
> > Signed-off-by: Jacob Pan 
> > Signed-off-by: Liu, Yi L 
> > ---
> >  drivers/iommu/intel-pasid.c | 240
> > +++-
> > drivers/iommu/intel-pasid.h |  12 +++ include/linux/intel-iommu.h
> > |   3 + 3 files changed, 252 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/iommu/intel-pasid.c
> > b/drivers/iommu/intel-pasid.c index 9bdb7ee228b6..10c7856afc6b
> > 100644 --- a/drivers/iommu/intel-pasid.c
> > +++ b/drivers/iommu/intel-pasid.c
> > @@ -359,6 +359,76 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value)
> > pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2);
> >  }
> >  
> > +/*
> > + * Setup the Extended Memory Type(EMT) field (Bits 91-93)
> > + * of a scalable mode PASID entry.
> > + */
> > +static inline void
> > +pasid_set_emt(struct pasid_entry *pe, u64 value)
> > +{
> > +   pasid_set_bits(&pe->val[1], GENMASK_ULL(29, 27), value << 27);
> > +}
> > +
> > +/*
> > + * Setup the Page Attribute Table (PAT) field (Bits 96-127)
> > + * of a scalable mode PASID entry.
> > + */
> > +static inline void
> > +pasid_set_pat(struct pasid_entry *pe, u64 value)
> > +{
> > +   pasid_set_bits(&pe->val[1], GENMASK_ULL(63, 32), value << 32);
> > +}
> > +
> > +/*
> > + * Setup the Cache Disable (CD) field (Bit 89)
> > + * of a scalable mode PASID entry.
> > + */
> > +static inline void
> > +pasid_set_cd(struct pasid_entry *pe)
> > +{
> > +   pasid_set_bits(&pe->val[1], 1 << 25, 1 << 25);
> > +}
> > +
> > +/*
> > + * Setup the Extended Memory Type Enable (EMTE) field (Bit 90)
> > + * of a scalable mode PASID entry.
> > + */
> > +static inline void
> > +pasid_set_emte(struct pasid_entry *pe)
> > +{
> > +   pasid_set_bits(&pe->val[1], 1 << 26, 1 << 26);
> > +}
> > +
> > +/*
> > + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135)
> > + * of a scalable mode PASID entry.
> > + */
> > +static inline void
> > +pasid_set_eafe(struct pasid_entry *pe)
> > +{
> > +   pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
> > +}
> > +
> > +/*
> > + * Setup the Page-level Cache Disable (PCD) field (Bit 95)
> > + * of a scalable mode PASID entry.
> > + */
> > +static inline void
> > +pasid_set_pcd(struct pasid_entry *pe)
> > +{
> > +   pasid_set_bits(&pe->val[1], 1 << 31, 1 << 31);
> > +}
> > +
> > +/*
> > + * Setup the Page-level Write-Through (PWT)) field (Bit 94)
> > + * of a scalable mode PASID entry.
> > + */
> > +static inline void
> > +pasid_set_pwt(struct pasid_entry *pe)
> > +{
> > +   pasid_set_bits(&pe->val[1], 1 << 30, 1 << 30);
> > +}
> > +
> >  static void
> >  pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
> > u16 did, int pasid)
> > @@ -492,7 +562,7 @@ int intel_pasid_setup_first_level(struct
> > intel_iommu *iommu,
> > pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); 
> > /* Setup Present and PASID Granular Transfer Type: */
> > -   pasid_set_translation_type(pte, 1);
> > +   pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY);
> > pasid_set_present(pte);
> > pasid_flush_caches(iommu, pte, pasid, did);
> >  
> > @@ -564,7 +634,7 @@ int intel_pasid_setup_second_level(struct
> > intel_iommu *iommu, pasid_set_domain_id(pte, did);
> > pasid_set_slptr(pte, pgd_val);
> > pasid_set_address_width(pte, agaw);
> > -   pasid_set_translation_type(pte, 2);
> > +   pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
> > pasid_set_fault_enable(pte);
> > pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
> >  
> > @@ -598,7 +668,7 @@ int intel_pasid_setup_pass_through(struct
> > intel_iommu *iommu, pasid_clear_entry(pte);
> > pasid_set_domain_id(pte, did);
> > pasid_set_address_width(pte, iommu->agaw);
> > -   pasid_set_translation_type(pte, 4);
> > +   pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT);
> > pasid_set_fault_enable(pte);
> > pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));  
> 
> All above looks good to me
> >  
> > @@ -612,3 +682,167 @@ int intel_pasid_setup_pass_through(struct
> > intel_iommu *iommu, 
> > return 0;
> >  }
> > +
> > +static int intel_pasid_setup_bind_data(struct intel_iommu *iommu,
> > +   struct pasid_entry *pte,
> > +   struct iommu_gpasid_bind_data_vtd *pasid_data)
> > +{
> > +   /*
> > +* Not all guest PASID table entry 

Re: [PATCH V10 06/11] iommu/vt-d: Add bind guest PASID support

2020-04-01 Thread Jacob Pan
On Tue, 31 Mar 2020 03:43:39 +
"Tian, Kevin"  wrote:

> > > >  struct intel_svm_dev {
> > > > @@ -698,9 +700,13 @@ struct intel_svm_dev {
> > > >  struct intel_svm {
> > > > struct mmu_notifier notifier;
> > > > struct mm_struct *mm;
> > > > +
> > > > struct intel_iommu *iommu;
> > > > int flags;
> > > > int pasid;
> > > > +   int gpasid; /* Guest PASID in case of vSVA bind with
> > > > non-identity host
> > > > +* to guest PASID mapping.
> > > > +*/  
> > >
> > > we don't need to highlight identity or non-identity thing, since
> > > either way shares the same infrastructure here and it is not the
> > > knowledge that the kernel driver should assume
> > >  
> > Sorry, I don't get your point.
> > 
> > What I meant was that this field "gpasid" is only used for
> > non-identity case. For identity case, we don't have
> > SVM_FLAG_GUEST_PASID.  
> 
> what's the problem if a guest tries to set gpasid even in identity
> case? do you want to add check to reject it? Also I remember we
> discussed before that we want to provide a consistent interface 
> to other consumer e.g. KVM to setup VMCS PASID translation table.
> In that case, regardless of identity or non-identity, we need provide
> such mapping info.
The solution is in flux a little. For KVM to set up the VMCS, we are
planning to use the IOASID set private ID as the guest PASID. So this part
of the code will go away, i.e. the G-H PASID mapping will no longer be
stored in the IOMMU driver. Perhaps we can address this after the transition?


Re: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function

2020-04-01 Thread Jacob Pan
On Wed, 1 Apr 2020 09:32:37 +0200
Auger Eric  wrote:

> >> devtlb
> >> descriptor, that is why Eric suggests {0, 0, 1}.  
> > 
> > I think it should be {0, 0, 1} :-) addr field and S field are must,
> > pasid field depends on G bit.  
> 
> On my side, I understood from the spec that addr/S are always used
> whatever the granularity, hence the above suggestion.
> 
> As a comparison, for PASID based IOTLB invalidation, it is clearly
> stated that if G matches PASID selective invalidation, address field
> is ignored. This is not written that way for PASID-based device TLB
> inv.
> > 
I misread the S bit. It all makes sense now. Thanks for the proposal
and explanation.

Jacob


Re: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function

2020-04-01 Thread Jacob Pan
On Wed, 1 Apr 2020 06:57:42 +
"Liu, Yi L"  wrote:

> > From: Tian, Kevin 
> > Sent: Wednesday, April 1, 2020 2:24 PM
> > To: Jacob Pan 
> > Subject: RE: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate
> > function 
> > > From: Jacob Pan 
> > > Sent: Wednesday, April 1, 2020 2:14 AM
> > >
> > > On Sat, 28 Mar 2020 10:01:42 +
> > > "Tian, Kevin"  wrote:
> > >  
> > > > > From: Jacob Pan 
> > > > > Sent: Saturday, March 21, 2020 7:28 AM
> > > > >
> > > > > When Shared Virtual Address (SVA) is enabled for a guest OS
> > > > > via vIOMMU, we need to provide invalidation support at IOMMU
> > > > > API and driver level. This patch adds Intel VT-d specific
> > > > > function to implement iommu passdown invalidate API for
> > > > > shared virtual address.
> > > > >
> > > > > The use case is for supporting caching structure invalidation
> > > > > of assigned SVM capable devices. Emulated IOMMU exposes
> > > > > queue  
> > > >
> > > > emulated IOMMU -> vIOMMU, since virtio-iommu could use the
> > > > interface as well.
> > > >  
> > > True, but it does not invalidate this statement about emulated
> > > IOMMU. I will add another statement saying "the same interface
> > > can be used for virtio-IOMMU as well". OK?  
> > 
> > sure
> >   
> > >  
> > > > > invalidation capability and passes down all descriptors from
> > > > > the guest to the physical IOMMU.
> > > > >
> > > > > The assumption is that guest to host device ID mapping should
> > > > > be resolved prior to calling IOMMU driver. Based on the
> > > > > device handle, host IOMMU driver can replace certain fields
> > > > > before submit to the invalidation queue.
> > > > >
> > > > > ---
> > > > > v7 review fixed in v10
> > > > > ---
> > > > >
> > > > > Signed-off-by: Jacob Pan 
> > > > > Signed-off-by: Ashok Raj 
> > > > > Signed-off-by: Liu, Yi L 
> > > > > ---
> > > > >  drivers/iommu/intel-iommu.c | 182
> > > > > 
> > > > >  1 file changed, 182 insertions(+)
> > > > >
> > > > > diff --git a/drivers/iommu/intel-iommu.c
> > > > > b/drivers/iommu/intel-iommu.c index b1477cd423dd..a76afb0fd51a
> > > > > 100644 --- a/drivers/iommu/intel-iommu.c
> > > > > +++ b/drivers/iommu/intel-iommu.c
> > > > > @@ -5619,6 +5619,187 @@ static void
> > > > > intel_iommu_aux_detach_device(struct iommu_domain *domain,
> > > > >   aux_domain_remove_dev(to_dmar_domain(domain), dev);
> > > > >  }
> > > > >
> > > > > +/*
> > > > > + * 2D array for converting and sanitizing IOMMU generic TLB
> > > > > granularity to
> > > > > + * VT-d granularity. Invalidation is typically included in
> > > > > the unmap operation
> > > > > + * as a result of DMA or VFIO unmap. However, for assigned
> > > > > devices guest
> > > > > + * owns the first level page tables. Invalidations of
> > > > > translation caches in the
> > > > > + * guest are trapped and passed down to the host.
> > > > > + *
> > > > > + * vIOMMU in the guest will only expose first level page
> > > > > tables, therefore
> > > > > + * we do not include IOTLB granularity for request without
> > > > > PASID (second level).  
> > > >
> > > > I would revise above as "We do not support IOTLB granularity for
> > > > request without PASID (second level), therefore any vIOMMU
> > > > implementation that exposes the SVA capability to the guest
> > > > should only expose the first level page tables, implying all
> > > > invalidation requests from the guest will include a valid PASID"
> > > >  
> > > Sounds good.
> > >  
> > > > > + *
> > > > > + * For example, to find the VT-d granularity encoding for
> > > > > IOTLB
> > > > > + * type and page selective granularity within PASID:
> > > > > + * X: indexed by iommu cache type
> > > > > + * Y: indexed by enum iommu_inv_granularity
> > > > > + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
> > > > > + *
> > > > > + * Granu_map array indicates validity of the table. 1:
> > > > > valid, 0: invalid
> > > > > + *
> > > > > + */
> > > > > +const static int
> > > > > inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > > > > + /*
> > > > > +  * PASID based IOTLB invalidation: PASID selective
> > > > > (per PASID),
> > > > > +  * page selective (address granularity)
> > > > > +  */
> > > > > + {0, 1, 1},
> > > > > + /* PASID based dev TLBs, only support all PASIDs or
> > > > > single PASID */
> > > > > + {1, 1, 0},  
> > > >
> > > > Is this combination correct? when single PASID is being
> > > > specified, it is essentially a page-selective invalidation
> > > > since you need provide Address and Size.
> > > >  
> > > This is for translation from generic UAPI granu to VT-d granu;
> > > it has nothing to do with address and size.
> > 
> > Generic UAPI defines three granularities: domain, pasid and addr.
> > From the definition, domain applies to all entries related to a did, pasid
> > applies to all entries related to a pasid, while addr is specific to
> > a range.
> > 
> > 
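For readers following the thread, the three generic granularities map to
the UAPI enum roughly as below (a sketch based on the names visible in
the quoted patch; check include/uapi/linux/iommu.h for the authoritative
definition):

enum iommu_inv_granularity {
	IOMMU_INV_GRANU_DOMAIN,	/* domain-selective: all entries for a DID */
	IOMMU_INV_GRANU_PASID,	/* PASID-selective: all entries for a PASID */
	IOMMU_INV_GRANU_ADDR,	/* page-selective: a specific address range */
	IOMMU_INV_GRANU_NR,	/* table dimension used by inv_type_granu_map */
};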

Re: [RFC PATCH v2] iommu/virtio: Use page size bitmap supported by endpoint

2020-04-01 Thread Jean-Philippe Brucker
On Wed, Apr 01, 2020 at 02:00:13PM +0100, Robin Murphy wrote:
> On 2020-04-01 12:38 pm, Bharat Bhushan wrote:
> > Different endpoints can support different page sizes. Probe the
> > endpoint to see if it supports a specific page size, otherwise use
> > the global page sizes.
> > 
> > Signed-off-by: Bharat Bhushan 
> > ---
> >   drivers/iommu/virtio-iommu.c  | 33 +++
> >   include/uapi/linux/virtio_iommu.h |  7 +++
> >   2 files changed, 36 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> > index cce329d71fba..c794cb5b7b3e 100644
> > --- a/drivers/iommu/virtio-iommu.c
> > +++ b/drivers/iommu/virtio-iommu.c
> > @@ -78,6 +78,7 @@ struct viommu_endpoint {
> > struct viommu_dev   *viommu;
> > struct viommu_domain*vdomain;
> > struct list_headresv_regions;
> > +   u64 pgsize_bitmap;
> >   };
> >   struct viommu_request {
> > @@ -415,6 +416,20 @@ static int viommu_replay_mappings(struct viommu_domain 
> > *vdomain)
> > return ret;
> >   }
> > +static int viommu_set_pgsize_bitmap(struct viommu_endpoint *vdev,
> > +   struct virtio_iommu_probe_pgsize_mask *mask,
> > +   size_t len)
> > +
> > +{
> > +   u64 pgsize_bitmap = le64_to_cpu(mask->pgsize_bitmap);
> > +
> > +   if (len < sizeof(*mask))
> > +   return -EINVAL;
> > +
> > +   vdev->pgsize_bitmap = pgsize_bitmap;
> > +   return 0;
> > +}
> > +
> >   static int viommu_add_resv_mem(struct viommu_endpoint *vdev,
> >struct virtio_iommu_probe_resv_mem *mem,
> >size_t len)
> > @@ -494,11 +509,13 @@ static int viommu_probe_endpoint(struct viommu_dev 
> > *viommu, struct device *dev)
> > while (type != VIRTIO_IOMMU_PROBE_T_NONE &&
> >cur < viommu->probe_size) {
> > len = le16_to_cpu(prop->length) + sizeof(*prop);
> > -

Whitespace change

> > switch (type) {
> > case VIRTIO_IOMMU_PROBE_T_RESV_MEM:
> > ret = viommu_add_resv_mem(vdev, (void *)prop, len);
> > break;
> > +   case VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK:
> > +   ret = viommu_set_pgsize_bitmap(vdev, (void *)prop, len);
> > +   break;
> > default:
> > dev_err(dev, "unknown viommu prop 0x%x\n", type);
> > }
> > @@ -607,16 +624,23 @@ static struct iommu_domain 
> > *viommu_domain_alloc(unsigned type)
> > return &vdomain->domain;
> >   }
> > -static int viommu_domain_finalise(struct viommu_dev *viommu,
> > +static int viommu_domain_finalise(struct viommu_endpoint *vdev,
> >   struct iommu_domain *domain)
> >   {
> > int ret;
> > struct viommu_domain *vdomain = to_viommu_domain(domain);
> > +   struct viommu_dev *viommu = vdev->viommu;
> > vdomain->viommu = viommu;
> > vdomain->map_flags  = viommu->map_flags;
> > -   domain->pgsize_bitmap   = viommu->pgsize_bitmap;
> > +   /* Devices in same domain must support same size pages */
> 
> AFAICS what the code appears to do is enforce that the first endpoint
> attached to any domain has the same pgsize_bitmap as the most recently
> probed viommu_dev instance, then ignore any subsequent endpoints attached to
> the same domain. Thus I'm not sure that comment is accurate.
> 

Yes viommu_domain_finalise() is only called once. What I had in mind is
something like:

 8< 
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 750f69c49b95..8303b7b513ff 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -639,6 +639,29 @@ static int viommu_domain_finalise(struct viommu_endpoint 
*vdev,
return 0;
 }
 
+/*
+ * Check whether the endpoint's capabilities are compatible with other 
endpoints
+ * in the domain. Report any inconsistency.
+ */
+static bool viommu_endpoint_is_compatible(struct viommu_endpoint *vdev,
+ struct viommu_domain *vdomain)
+{
+   struct device *dev = vdev->dev;
+
+   if (vdomain->viommu != vdev->viommu) {
+   dev_err(dev, "cannot attach to foreign vIOMMU\n");
+   return false;
+   }
+
+   if (vdomain->domain.pgsize_bitmap != vdev->pgsize_bitmap) {
+   dev_err(dev, "incompatible domain bitmap 0x%lx != 0x%lx\n",
+   vdomain->domain.pgsize_bitmap, vdev->pgsize_bitmap);
+   return false;
+   }
+
+   return true;
+}
+
 static void viommu_domain_free(struct iommu_domain *domain)
 {
struct viommu_domain *vdomain = to_viommu_domain(domain);
@@ -670,9 +693,8 @@ static int viommu_attach_dev(struct iommu_domain *domain, 
struct device *dev)
 * owns it.
 */
ret = viommu_domain_finalise(vdev, domain);
-   } else if 

Re: [PATCH V10 11/11] iommu/vt-d: Add custom allocator for IOASID

2020-04-01 Thread Jacob Pan
On Sat, 28 Mar 2020 10:22:41 +
"Tian, Kevin"  wrote:

> > From: Jacob Pan 
> > Sent: Saturday, March 21, 2020 7:28 AM
> > 
> > When VT-d driver runs in the guest, PASID allocation must be
> > performed via virtual command interface. This patch registers a
> > custom IOASID allocator which takes precedence over the default
> > XArray based allocator. The resulting IOASID allocation will always
> > come from the host. This ensures that PASID namespace is system-
> > wide.
> > 
> > Signed-off-by: Lu Baolu 
> > Signed-off-by: Liu, Yi L 
> > Signed-off-by: Jacob Pan 
> > ---
> >  drivers/iommu/intel-iommu.c | 84
> > +
> >  include/linux/intel-iommu.h |  2 ++
> >  2 files changed, 86 insertions(+)
> > 
> > diff --git a/drivers/iommu/intel-iommu.c
> > b/drivers/iommu/intel-iommu.c index a76afb0fd51a..c1c0b0fb93c3
> > 100644 --- a/drivers/iommu/intel-iommu.c
> > +++ b/drivers/iommu/intel-iommu.c
> > @@ -1757,6 +1757,9 @@ static void free_dmar_iommu(struct intel_iommu
> > *iommu)
> > if (ecap_prs(iommu->ecap))
> > intel_svm_finish_prq(iommu);
> > }
> > +   if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
> > +   ioasid_unregister_allocator(&iommu->pasid_allocator);
> > +
> >  #endif
> >  }
> > 
> > @@ -3291,6 +3294,84 @@ static int copy_translation_tables(struct
> > intel_iommu *iommu)
> > return ret;
> >  }
> > 
> > +#ifdef CONFIG_INTEL_IOMMU_SVM
> > +static ioasid_t intel_ioasid_alloc(ioasid_t min, ioasid_t max,
> > void *data)  
> 
> the name is too generic... can we add vcmd in the name to clarify
> its purpose, e.g. intel_vcmd_ioasid_alloc?
> 
I feel the intel_ prefix is a natural extension of a generic API; we do
that for other IOMMU APIs, right?

> > +{
> > +   struct intel_iommu *iommu = data;
> > +   ioasid_t ioasid;
> > +
> > +   if (!iommu)
> > +   return INVALID_IOASID;
> > +   /*
> > +* VT-d virtual command interface always uses the full 20
> > bit
> > +* PASID range. Host can partition guest PASID range based
> > on
> > +* policies but it is out of guest's control.
> > +*/
> > +   if (min < PASID_MIN || max > intel_pasid_max_id)
> > +   return INVALID_IOASID;
> > +
> > +   if (vcmd_alloc_pasid(iommu, &ioasid))
> > +   return INVALID_IOASID;
> > +
> > +   return ioasid;
> > +}
> > +
> > +static void intel_ioasid_free(ioasid_t ioasid, void *data)
> > +{
> > +   struct intel_iommu *iommu = data;
> > +
> > +   if (!iommu)
> > +   return;
> > +   /*
> > +* Sanity check the ioasid owner is done at upper layer,
> > e.g. VFIO
> > +* We can only free the PASID when all the devices are
> > unbound.
> > +*/
> > +   if (ioasid_find(NULL, ioasid, NULL)) {
> > +   pr_alert("Cannot free active IOASID %d\n", ioasid);
> > +   return;
> > +   }  
> 
> However the sanity check is not done in default_free. Is there a
> reason why using vcmd adds such  new requirement?
> 
Since we don't support nested guests, this vcmd allocator is only used
by the guest IOMMU driver, not VFIO. We expect the IOMMU driver to have
control of the free()/unbind() ordering.

For default_free, it can come from user space and host VFIO which can
be out of order. But we will solve that issue with the blocking
notifier.

> > +   vcmd_free_pasid(iommu, ioasid);
> > +}
> > +
> > +static void register_pasid_allocator(struct intel_iommu *iommu)
> > +{
> > +   /*
> > +* If we are running in the host, no need for custom
> > allocator
> > +* in that PASIDs are allocated from the host system-wide.
> > +*/
> > +   if (!cap_caching_mode(iommu->cap))
> > +   return;  
> 
> is it more accurate to check against vcmd capability?
> 
I think this is sufficient. The spec says if vcmd is present, we must
use it but not the other way.

> > +
> > +   if (!sm_supported(iommu)) {
> > +   pr_warn("VT-d Scalable Mode not enabled, no PASID
> > allocation\n");
> > +   return;
> > +   }
> > +
> > +   /*
> > +* Register a custom PASID allocator if we are running in
> > a guest,
> > +* guest PASID must be obtained via virtual command
> > interface.
> > +* There can be multiple vIOMMUs in each guest but only one
> > allocator
> > +* is active. All vIOMMU allocators will eventually be
> > calling the same  
> 
> which one? the first or last?
> 
All allocators share the same ops, so first = last. The IOASID code will
inspect the ops functions and, if they are shared with another registered
allocator, reuse the same ops.
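
A sketch of that sharing check (illustrative; the real logic lives in
drivers/iommu/ioasid.c):

/* Two registered allocators are treated as the same if their callbacks
 * match; the core then keeps one active allocator and refcounts the
 * duplicates instead of switching between them. */
static bool same_allocator_ops(struct ioasid_allocator_ops *a,
			       struct ioasid_allocator_ops *b)
{
	return a->alloc == b->alloc && a->free == b->free;
}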

> > +* host allocator.
> > +*/
> > +   if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap)) {
> > +   pr_info("Register custom PASID allocator\n");
> > +   iommu->pasid_allocator.alloc = intel_ioasid_alloc;
> > +   iommu->pasid_allocator.free = intel_ioasid_free;
> > +   iommu->pasid_allocator.pdata = (void *)iommu;
> > +   if (ioasid_register_allocator(&iommu->pasid_allocator)) {
> > +   pr_warn("Custom PASID allocator 

Re: [PATCH 00/10] IOASID extensions for guest SVA

2020-04-01 Thread Jean-Philippe Brucker
Hi Jacob,

On Wed, Mar 25, 2020 at 10:55:21AM -0700, Jacob Pan wrote:
> IOASID was introduced in v5.5 as a generic kernel allocator service for
> both PCIe Process Address Space ID (PASID) and ARM SMMU's Sub Stream
> ID. In addition to basic ID allocation, ioasid_set was introduced as a
> token that is shared by a group of IOASIDs. This set token can be used
> for permission checking but lacks some features needed by guest Shared
> Virtual Address (SVA). In addition, IOASID support for life cycle
> management is needed among multiple users.
> 
> This patchset introduces two extensions to the IOASID code,
> 1. IOASID set operations
> 2. Notifications for IOASID state synchronization

My main concern with this series is patch 7 changing the spinlock to a
mutex, which prevents SVA from calling ioasid_free() from the RCU callback
of MMU notifiers. Could we use atomic notifiers, or do the FREE
notification another way?

Most of my other comments are just confusion on my part, I think, as I
haven't yet properly looked through the VFIO and VT-d changes. I'd rather
avoid the change to ioasid_find() if possible, because it adds a seemingly
unnecessary indirection to the fast path, but it's probably insignificant.

Thanks,
Jean

> 
> Part #1:
> IOASIDs used by each VM fits naturally into an ioasid_set. The usage
> for per set management requires the following features:
> 
> - Quota enforcement - This is to prevent one VM from abusing the
>   allocator to take all the system IOASIDs. Though the VFIO layer can also
>   enforce the quota, it cannot cover the usage with both guest and
>   host SVA on the same system.
> 
> - Stores guest IOASID-Host IOASID mapping within the set. To
>   support live migration, IOASID namespace should be owned by the
>   guest. This requires per IOASID set look up between guest and host
>   IOASIDs. This patchset does not introduce non-identity guest-host
>   IOASID lookup, we merely introduce the infrastructure in per set data.
> 
> - Set level operations, e.g. when a guest terminates, it is likely to
> free the entire set. Having a single place to manage the set where the
> IOASIDs are stored makes iteration much easier.
> 
> 
> New APIs are:
> - void ioasid_install_capacity(ioasid_t total);
>   Set the system capacity prior to any allocations. On x86, VT-d driver
>   calls this function to set max number of PASIDs, typically 1 million
>   (20 bits).
> 
> - int ioasid_alloc_system_set(int quota);
>   Host system has a default ioasid_set, during boot it is expected that
>   this default set is allocated with a reasonable quota, e.g. PID_MAX.
>   This default/system set is used for baremetal SVA.
> 
> - int ioasid_alloc_set(struct ioasid_set *token, ioasid_t quota, int
> *sid);
>   Allocate a new set with a token, returned sid (set ID) will be used
>   to allocate IOASIDs within the set. Allocation of IOASIDs cannot
>   exceed the quota.
> 
> - void ioasid_free_set(int sid, bool destroy_set);
>   Free the entire set and notify all users with an option to destroy
>   the set. Set ID can be used for allocation again if not destroyed.
> 
> - int ioasid_find_sid(ioasid_t ioasid);
>   Look up the set ID from an ioasid. There is no reference held,
>   assuming set has a single owner.
> 
> - int ioasid_adjust_set(int sid, int quota);
>   Change the quota of the set, new quota cannot be less than the number
>   of IOASIDs already allocated within the set. This is useful when
>   IOASID resource needs to be balanced among VMs.
> 
> Part #2
> Notification service. Since IOASIDs are used by many consumers that
> follow publisher-subscriber pattern, notification is a natural choice
> to keep states synchronized. For example, on x86 system, guest PASID
> allocation and bind call results in VFIO IOCTL that can add and change
> guest-host PASID states. At the same time, IOMMU driver and KVM need to
> maintain its own PASID contexts. In this case, VFIO is the publisher
> within the kernel, IOMMU driver and KVM are the subscribers.
> 
> This patchset introduces a global blocking notifier chain and APIs to
> operate on. Not all events nor all IOASIDs are of interests to all
> subscribers. e.g. KVM is only interested in the IOASIDs within its set.
> IOMMU driver is not ioasid_set aware. A further optimization could be
> having both global and per set notifier. But consider the infrequent
> nature of bind/unbind and relatively long process life cycle, this
> optimization may not be needed at this time.
> 
> To register/unregister notification blocks, use these two APIs:
> - int ioasid_add_notifier(struct notifier_block *nb);
> - void ioasid_remove_notifier(struct notifier_block *nb)
> 
> To send notification on an IOASID with one of the commands (FREE,
> BIND/UNBIND, etc.), use:
> - int ioasid_notify(ioasid_t id, enum ioasid_notify_val cmd);
> 
> This work is a result of collaboration with many people:
> Liu, Yi L 
> Wu Hao 
> Ashok Raj 
> Kevin Tian 
> 
> Thanks,
> 
> Jacob
> 
> Jacob Pan (10):
>   

Re: [PATCH 08/10] iommu/ioasid: Introduce notifier APIs

2020-04-01 Thread Jean-Philippe Brucker
On Wed, Mar 25, 2020 at 10:55:29AM -0700, Jacob Pan wrote:
> IOASID users fit into the publisher-subscriber pattern, so a
> system-wide blocking notifier chain can be used to inform subscribers
> of state changes. The notifier mechanism also abstracts the publisher
> from knowing the private context each subscriber may have.
> 
> This patch adds APIs and a global notifier chain; a further
> optimization might be a per-set notifier for ioasid_set-aware users.
> 
> Usage example:
> KVM registers a notifier block so that it can keep its guest-host
> PASID translation table in sync with any IOASID updates.

When you talk about KVM, is it for

  [PATCH 0/7] x86: tag application address space for devices

or something else as well? (I don't see mentions of KVM in that series)

> 
> VFIO publishes IOASID changes by performing alloc/free and bind/unbind
> operations.

I was rather seeing IOASID as the end of the VFIO-IOMMU-IOASID chain;
putting it in the middle complicates locking. If you only need the FREE
notifier for this case, maybe VFIO could talk directly to the IOMMU
driver before freeing an IOASID?  gpasid_unbind() should already clear the
PASID contexts, no?

Thanks,
Jean

> The IOMMU driver gets notified when an IOASID is freed by VFIO or core
> mm code so that the PASID context can be cleaned up.
> 
> Signed-off-by: Liu Yi L 
> Signed-off-by: Jacob Pan 


Re: [PATCH 07/10] iommu/ioasid: Use mutex instead of spinlock

2020-04-01 Thread Jean-Philippe Brucker
On Wed, Mar 25, 2020 at 10:55:28AM -0700, Jacob Pan wrote:
> Each IOASID or set could have multiple users, each with its own HW
> context to maintain. Oftentimes access to the HW context requires
> thread context. For example, consumers of IOASIDs can register
> notification blocks to sync up their states. An atomic notifier is not
> feasible for these update operations.
> 
> This patch converts the allocator lock from a spinlock to a mutex in
> preparation for the IOASID notifier.

Unfortunately this doesn't work for SVA, which needs to call ioasid_free()
from the RCU callback of mmu_notifier_put(), which cannot sleep. We're
relying on MMU notifiers this way to ensure that there is a single IOASID
per mm.
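
(For readers following the thread: the constraint comes from the final
teardown running in an RCU callback. A rough sketch with approximate
names; the real code lives in the SVA series:)

  #include <linux/mmu_notifier.h>
  #include <linux/ioasid.h>
  #include <linux/slab.h>

  struct example_bond {
          struct mmu_notifier     mn;
          ioasid_t                pasid;
  };

  static void example_free_notifier(struct mmu_notifier *mn)
  {
          struct example_bond *bond =
                  container_of(mn, struct example_bond, mn);

          /*
           * Runs from the RCU callback queued by mmu_notifier_put(),
           * i.e. atomic context: ioasid_free() may not take a mutex here.
           */
          ioasid_free(bond->pasid);
          kfree(bond);
  }

  static const struct mmu_notifier_ops example_mn_ops = {
          .free_notifier  = example_free_notifier,
  };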

Thanks,
Jean


Re: [PATCH 06/10] iommu/ioasid: Convert to set aware allocations

2020-04-01 Thread Jean-Philippe Brucker
On Wed, Mar 25, 2020 at 10:55:27AM -0700, Jacob Pan wrote:
> The current ioasid_alloc function takes a token/ioasid_set and records
> it on the IOASID being allocated. There is no alloc/free on the
> ioasid_set.
> 
> With the IOASID set APIs, callers must allocate an ioasid_set before
> allocating IOASIDs within the set. Quota and other ioasid_set-level
> activities can then be enforced.
> 
> This patch converts existing API to the new ioasid_set model.
> 
> Signed-off-by: Liu Yi L 
> Signed-off-by: Jacob Pan 

[...]

> @@ -379,6 +391,9 @@ ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t 
> min, ioasid_t max,
>   }
>   data->id = id;
>  
> + /* Store IOASID in the per set data */
> + xa_store(&sdata->xa, id, data, GFP_KERNEL);

I couldn't figure out why you're maintaining an additional xarray for each
set. We're already storing that data in active_allocator->xa, why the
duplication?  If it's for the gPASID -> hPASID translation mentioned by
the cover letter, maybe you could add this xa when introducing that
change?

Thanks,
Jean



Re: [PATCH 05/10] iommu/ioasid: Create an IOASID set for host SVA use

2020-04-01 Thread Jean-Philippe Brucker
On Wed, Mar 25, 2020 at 10:55:26AM -0700, Jacob Pan wrote:
> Bare metal SVA allocates IOASIDs for native process addresses. This
> should be separated from VM-allocated IOASIDs, and thus kept under its
> own set.
> 
> This patch creates a system IOASID set with its quota set to PID_MAX.
> This is a reasonable default in that SVM-capable devices can only bind
> to a limited number of user processes.

Yes, realistically there won't be more than PID_MAX_DEFAULT=0x8000 bound
address spaces. My machine uses a PID_MAX of 4 million though, so in
theory more than 0x8000 processes may want a bond. On Arm the limit of
shared contexts per VM is currently a little less than 0x10000 (which is
the number of CPU ASIDs).

But quotas are only necessary for VMs, when the host shares the PASID
space with them (which isn't a use-case for Arm systems as far as I know;
each VM gets its own PASID space). Could we have quota-free IOASID sets
for the host?

For the SMMU I'd like to allocate two sets, one SVA and one private for
auxiliary domains, and I don't think giving either a quota makes much
sense at the moment. There can be systems using only SVA and systems using
only private PASIDs. I think it should be first-come-first-served until
admins want a knob to define a policy themselves, based on cgroups for
example.
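
(One purely hypothetical shape for that: reserve a quota value meaning
first-come-first-served. Nothing like this exists in the posted patches;
the macro, set names and helper below are all made up.)

  #define IOASID_QUOTA_NONE       ((ioasid_t)-1)

  static DECLARE_IOASID_SET(smmu_sva_set);        /* shared, for SVA */
  static DECLARE_IOASID_SET(smmu_aux_set);        /* private, aux domains */

  static int smmu_init_ioasid_sets(int *sva_sid, int *aux_sid)
  {
          int ret;

          ret = ioasid_alloc_set(&smmu_sva_set, IOASID_QUOTA_NONE, sva_sid);
          if (ret)
                  return ret;
          return ioasid_alloc_set(&smmu_aux_set, IOASID_QUOTA_NONE, aux_sid);
  }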

> Signed-off-by: Jacob Pan 
> ---
>  drivers/iommu/intel-iommu.c | 8 +++-
>  drivers/iommu/ioasid.c  | 9 +
>  include/linux/ioasid.h  | 9 +
>  3 files changed, 25 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> index ec3fc121744a..af7a1ef7b31e 100644
> --- a/drivers/iommu/intel-iommu.c
> +++ b/drivers/iommu/intel-iommu.c
> @@ -3511,8 +3511,14 @@ static int __init init_dmars(void)
>   goto free_iommu;
>  
> >   /* PASID is needed for scalable mode irrespective of SVM */
> - if (intel_iommu_sm)
> + if (intel_iommu_sm) {
>   ioasid_install_capacity(intel_pasid_max_id);
> + /* We should not run out of IOASIDs at boot */
> + if (ioasid_alloc_system_set(PID_MAX_DEFAULT)) {
> + pr_err("Failed to enable host PASID allocator\n");
> + intel_iommu_sm = 0;
> + }
> + }
>  
>   /*
>* for each drhd
> diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
> index 6265d2dbbced..9135af171a7c 100644
> --- a/drivers/iommu/ioasid.c
> +++ b/drivers/iommu/ioasid.c
> @@ -39,6 +39,9 @@ struct ioasid_data {
>  static ioasid_t ioasid_capacity;
>  static ioasid_t ioasid_capacity_avail;
>  
> +int system_ioasid_sid;
> +static DECLARE_IOASID_SET(system_ioasid);
> +
>  /* System capacity can only be set once */
>  void ioasid_install_capacity(ioasid_t total)
>  {
> @@ -51,6 +54,12 @@ void ioasid_install_capacity(ioasid_t total)
>  }
>  EXPORT_SYMBOL_GPL(ioasid_install_capacity);
>  
> +int ioasid_alloc_system_set(int quota)
> +{
> + return ioasid_alloc_set(&system_ioasid, quota, &system_ioasid_sid);
> +}
> +EXPORT_SYMBOL_GPL(ioasid_alloc_system_set);

I think this helper could stay in the VT-d driver for the moment. If the
SMMU driver ever implements auxiliary domains it will use a private IOASID
set, separate from the shared IOASID set managed by iommu-sva. Both could
qualify as "system set".

Thanks,
Jean

> +
>  /*
>   * struct ioasid_allocator_data - Internal data structure to hold information
>   * about an allocator. There are two types of allocators:
> diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
> index 8c82d2625671..097b1cc043a3 100644
> --- a/include/linux/ioasid.h
> +++ b/include/linux/ioasid.h
> @@ -29,6 +29,9 @@ struct ioasid_allocator_ops {
>   void *pdata;
>  };
>  
> +/* Shared IOASID set reserved for host system use */
> +extern int system_ioasid_sid;
> +
>  #define DECLARE_IOASID_SET(name) struct ioasid_set name = { 0 }
>  
>  #if IS_ENABLED(CONFIG_IOASID)
> @@ -41,6 +44,7 @@ int ioasid_register_allocator(struct ioasid_allocator_ops 
> *allocator);
>  void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator);
>  int ioasid_attach_data(ioasid_t ioasid, void *data);
>  void ioasid_install_capacity(ioasid_t total);
> +int ioasid_alloc_system_set(int quota);
>  int ioasid_alloc_set(struct ioasid_set *token, ioasid_t quota, int *sid);
>  void ioasid_free_set(int sid, bool destroy_set);
>  int ioasid_find_sid(ioasid_t ioasid);
> @@ -88,5 +92,10 @@ static inline void ioasid_install_capacity(ioasid_t total)
>  {
>  }
>  
> +static inline int ioasid_alloc_system_set(int quota)
> +{
> + return -ENOTSUPP;
> +}
> +
>  #endif /* CONFIG_IOASID */
>  #endif /* __LINUX_IOASID_H */
> -- 
> 2.7.4
> 


Re: [PATCH 03/10] iommu/ioasid: Introduce per set allocation APIs

2020-04-01 Thread Jean-Philippe Brucker
On Wed, Mar 25, 2020 at 10:55:24AM -0700, Jacob Pan wrote:
> An IOASID set defines a group of IDs that share the same token. The
> ioasid_set concept helps to do permission checking among users, as in
> the current code.
> 
> With guest SVA usage, each VM has its own IOASID set. More
> functionality is needed:
> 1. Enforce quota; each guest may be assigned a limited quota such that
> one guest cannot abuse all the system resources.
> 2. Store the IOASID mapping between guest and host IOASIDs
> 3. Per-set operations, e.g. free the entire set
> 
> For each ioasid_set token, a unique set ID is assigned. This makes
> referencing the set and looking up data much easier to implement.
> 
> Signed-off-by: Liu Yi L 
> Signed-off-by: Jacob Pan 
> ---
>  drivers/iommu/ioasid.c | 147 
> +
>  include/linux/ioasid.h |  13 +
>  2 files changed, 160 insertions(+)
> 
> diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
> index 4026e52855b9..27ee57f7079b 100644
> --- a/drivers/iommu/ioasid.c
> +++ b/drivers/iommu/ioasid.c
> @@ -10,6 +10,25 @@
>  #include 
>  #include 
>  
> +static DEFINE_XARRAY_ALLOC(ioasid_sets);
> +/**
> + * struct ioasid_set_data - Meta data about ioasid_set
> + *
> + * @token:       Unique token to identify an IOASID set
> + * @xa:          XArray to store subset ID and IOASID mapping
> + * @size:        Max number of IOASIDs that can be allocated within the set
> + * @nr_ioasids:  Number of IOASIDs allocated in the set
> + * @sid:         ID of the set
> + */
> +struct ioasid_set_data {
> + struct ioasid_set *token;
> + struct xarray xa;
> + int size;
> + int nr_ioasids;
> + int sid;
> + struct rcu_head rcu;
> +};
> +
>  struct ioasid_data {
>   ioasid_t id;
>   struct ioasid_set *set;
> @@ -388,6 +407,111 @@ void ioasid_free(ioasid_t ioasid)
>  EXPORT_SYMBOL_GPL(ioasid_free);
>  
>  /**
> + * ioasid_alloc_set - Allocate a set of IOASIDs
> + * @token:   Unique token of the IOASID set
> + * @quota:   Quota allowed in this set
> + * @sid: IOASID set ID to be assigned
> + *
> + * Return 0 upon success. Token will be stored internally for lookup,
> + * IOASID allocation within the set and other per set operations will use
> + * the @sid assigned.
> + *
> + */
> +int ioasid_alloc_set(struct ioasid_set *token, ioasid_t quota, int *sid)
> +{
> + struct ioasid_set_data *sdata;
> + ioasid_t id;
> + int ret = 0;
> +
> + if (quota > ioasid_capacity_avail) {
> + pr_warn("Out of IOASID capacity! ask %d, avail %d\n",
> + quota, ioasid_capacity_avail);
> + return -ENOSPC;
> + }

This check should be in the same critical section as the quota
subtraction.

> +
> + sdata = kzalloc(sizeof(*sdata), GFP_KERNEL);
> + if (!sdata)
> + return -ENOMEM;

I don't understand why we need this structure at all, nor why we need the
SID. Users have already allocated an ioasid_set, so why not just stick the
content of ioasid_set_data in there, and pass the ioasid_set pointer to
ioasid_alloc()?
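
Concretely, the alternative would look something like this (a sketch of
the suggestion, not code from the series):

  /* Fold the metadata into ioasid_set itself; no separate SID. */
  struct ioasid_set {
          struct xarray   xa;             /* IOASIDs in this set */
          ioasid_t        quota;
          ioasid_t        nr_ioasids;
          struct rcu_head rcu;
  };

  /* Callers keep passing the set, as ioasid_alloc() already does: */
  ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min,
                        ioasid_t max, void *private);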

> +
> + spin_lock(&ioasid_allocator_lock);
> +
> + ret = xa_alloc(&ioasid_sets, &id, sdata,
> +XA_LIMIT(0, ioasid_capacity_avail - quota),
> +GFP_KERNEL);

Same as Kevin, I think the limit should be the static ioasid_capacity. And
perhaps add a comment explaining the worst case of one PASID per set. I
found it a little confusing to use the same type ioasid_t for IOASIDs and
IOASID sets; it may be clearer to use an int for IOASID set IDs.

Thanks,
Jean

> + if (ret) {
> + kfree(sdata);
> + goto error;
> + }
> +
> + sdata->token = token;
> + sdata->size = quota;
> + sdata->sid = id;
> +
> + /*
> +  * Set Xarray is used to store IDs within the set, get ready for
> +  * sub-set ID and system-wide IOASID allocation results.
> +  */
> + xa_init_flags(&sdata->xa, XA_FLAGS_ALLOC);
> +
> + ioasid_capacity_avail -= quota;
> + *sid = id;
> +
> +error:
> + spin_unlock(&ioasid_allocator_lock);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(ioasid_alloc_set);
> +
> +/**
> + * ioasid_free_set - Free all IOASIDs within the set
> + *
> + * @sid: The IOASID set ID to be freed
> + * @destroy_set: Whether to destroy the set after freeing its
> + *   IOASIDs. If false, the set is kept for further allocation.
> + *
> + * All IOASIDs allocated within the set will be freed upon return.
> + */
> +void ioasid_free_set(int sid, bool destroy_set)
> +{
> + struct ioasid_set_data *sdata;
> + struct ioasid_data *entry;
> + unsigned long index;
> +
> + spin_lock(&ioasid_allocator_lock);
> + sdata = xa_load(&ioasid_sets, sid);
> + if (!sdata) {
> + pr_err("No IOASID set found to free %d\n", sid);
> + goto done_unlock;
> + }
> +
> + if (xa_empty(&sdata->xa)) {
> + pr_warn("No IOASIDs in the set %d\n", sdata->sid);
> + goto done_destroy;
> + }
> +
> + 

Re: [PATCH 01/10] iommu/ioasid: Introduce system-wide capacity

2020-04-01 Thread Jean-Philippe Brucker
On Wed, Mar 25, 2020 at 10:55:22AM -0700, Jacob Pan wrote:
> IOASID is a limited system-wide resource that can be allocated at
> runtime. The limit can be enumerated during boot. For example, on
> x86 platforms, PCI Process Address Space ID (PASID) allocation uses
> the IOASID service. The number of supported PASID bits is enumerated
> by the extended capability register as defined in the VT-d spec.
> 
> This patch adds a helper to set the system capacity; it is expected to
> be set during boot prior to any allocation requests.

This one-time setting is a bit awkward. Since multiple IOMMU drivers may
be loaded, this can't be a module_init() thing. And we generally have
multiple SMMU instances in the system. So we'd need to call
install_capacity() only for the first SMMU loaded with an arbitrary 1<<20,
even though each SMMU can support different numbers of PASID bits.
Furthermore, modules such as iommu-sva will want to initialize their
IOASID set at module_init(), which will happen before the SMMU can set up
the capacity, so ioasid_alloc_set() will return an error.

How about hardcoding ioasid_capacity to 20 bits for now?  It's the PCIe
limit and probably won't have to increase for a while.
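
i.e. roughly (sketch):

  /* Fix the capacity at the PCIe PASID limit instead of having the
   * first IOMMU driver to load install it at runtime. */
  #define IOASID_CAPACITY         (1U << 20)      /* 20-bit PASID space */

  static ioasid_t ioasid_capacity = IOASID_CAPACITY;
  static ioasid_t ioasid_capacity_avail = IOASID_CAPACITY;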

Thanks,
Jean

> 
> Signed-off-by: Jacob Pan 
> ---
>  drivers/iommu/ioasid.c | 15 +++
>  include/linux/ioasid.h |  5 -
>  2 files changed, 19 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
> index 0f8dd377aada..4026e52855b9 100644
> --- a/drivers/iommu/ioasid.c
> +++ b/drivers/iommu/ioasid.c
> @@ -17,6 +17,21 @@ struct ioasid_data {
>   struct rcu_head rcu;
>  };
>  
> +static ioasid_t ioasid_capacity;
> +static ioasid_t ioasid_capacity_avail;
> +
> +/* System capacity can only be set once */
> +void ioasid_install_capacity(ioasid_t total)
> +{
> + if (ioasid_capacity) {
> + pr_warn("IOASID capacity already set at %d\n", ioasid_capacity);
> + return;
> + }
> +
> + ioasid_capacity = ioasid_capacity_avail = total;
> +}
> +EXPORT_SYMBOL_GPL(ioasid_install_capacity);
> +
>  /*
>   * struct ioasid_allocator_data - Internal data structure to hold information
>   * about an allocator. There are two types of allocators:
> diff --git a/include/linux/ioasid.h b/include/linux/ioasid.h
> index 6f000d7a0ddc..9711fa0dc357 100644
> --- a/include/linux/ioasid.h
> +++ b/include/linux/ioasid.h
> @@ -40,7 +40,7 @@ void *ioasid_find(struct ioasid_set *set, ioasid_t ioasid,
>  int ioasid_register_allocator(struct ioasid_allocator_ops *allocator);
>  void ioasid_unregister_allocator(struct ioasid_allocator_ops *allocator);
>  int ioasid_set_data(ioasid_t ioasid, void *data);
> -
> +void ioasid_install_capacity(ioasid_t total);
>  #else /* !CONFIG_IOASID */
>  static inline ioasid_t ioasid_alloc(struct ioasid_set *set, ioasid_t min,
>   ioasid_t max, void *private)
> @@ -72,5 +72,8 @@ static inline int ioasid_set_data(ioasid_t ioasid, void 
> *data)
>   return -ENOTSUPP;
>  }
>  
> +static inline void ioasid_install_capacity(ioasid_t total)
> +{
> +}
>  #endif /* CONFIG_IOASID */
>  #endif /* __LINUX_IOASID_H */
> -- 
> 2.7.4
> 


Re: [PATCH v10 04/11] vfio/pci: Add VFIO_REGION_TYPE_NESTED region type

2020-04-01 Thread Auger Eric
Hi Yi,

On 4/1/20 3:18 PM, Liu, Yi L wrote:
> Hi Eric,
> 
> Just curious about your plan for this patch; I just heard that my
> colleague would like to reference the functions from this patch in his
> DSA driver work.

Well, I intend to respin until somebody tells me it is completely vain, or
death follows. Joking aside, feel free to embed it in any series it would
be beneficial to; just please Cc me in case the code diverges.

Thanks

Eric
> 
> Regards,
> Yi Liu
> 
>> From: Eric Auger 
>> Sent: Saturday, March 21, 2020 12:19 AM
>> To: eric.auger@gmail.com; eric.au...@redhat.com; iommu@lists.linux-
>> foundation.org; linux-ker...@vger.kernel.org; k...@vger.kernel.org;
>> kvm...@lists.cs.columbia.edu; j...@8bytes.org; alex.william...@redhat.com;
>> jacob.jun@linux.intel.com; Liu, Yi L ; jean-
>> philippe.bruc...@arm.com; will.dea...@arm.com; robin.mur...@arm.com
>> Cc: marc.zyng...@arm.com; peter.mayd...@linaro.org; zhangfei@gmail.com
>> Subject: [PATCH v10 04/11] vfio/pci: Add VFIO_REGION_TYPE_NESTED region type
>>
>> Add a new specific DMA_FAULT region aiming to expose nested mode
>> translation faults.
>>
>> The region has a ring buffer that contains the actual fault
>> records, plus a header that allows handling them (tail/head indices,
>> max capacity, entry size). At the moment the region is dimensioned
>> for 512 fault records.
>>
>> Signed-off-by: Eric Auger 
>>
>> ---
>>
>> v8 -> v9:
>> - Use a single region instead of a prod/cons region
>>
>> v4 -> v5
>> - check cons is not null in vfio_pci_check_cons_fault
>>
>> v3 -> v4:
>> - use 2 separate regions, respectively in read and write modes
>> - add the version capability
>> ---
>>  drivers/vfio/pci/vfio_pci.c | 68 +
>>  drivers/vfio/pci/vfio_pci_private.h | 10 +
>>  drivers/vfio/pci/vfio_pci_rdwr.c| 45 +++
>>  include/uapi/linux/vfio.h   | 35 +++
>>  4 files changed, 158 insertions(+)
>>
>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>> index 379a02c36e37..586b89debed5 100644
>> --- a/drivers/vfio/pci/vfio_pci.c
>> +++ b/drivers/vfio/pci/vfio_pci.c
>> @@ -260,6 +260,69 @@ int vfio_pci_set_power_state(struct vfio_pci_device 
>> *vdev,
>> pci_power_t state)
>>  return ret;
>>  }
>>
>> +static void vfio_pci_dma_fault_release(struct vfio_pci_device *vdev,
>> +   struct vfio_pci_region *region)
>> +{
>> +}
>> +
>> +static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev,
>> + struct vfio_pci_region *region,
>> + struct vfio_info_cap *caps)
>> +{
>> +struct vfio_region_info_cap_fault cap = {
>> +.header.id = VFIO_REGION_INFO_CAP_DMA_FAULT,
>> +.header.version = 1,
>> +.version = 1,
>> +};
>> +return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
>> +}
>> +
>> +static const struct vfio_pci_regops vfio_pci_dma_fault_regops = {
>> +.rw = vfio_pci_dma_fault_rw,
>> +.release= vfio_pci_dma_fault_release,
>> +.add_capability = vfio_pci_dma_fault_add_capability,
>> +};
>> +
>> +#define DMA_FAULT_RING_LENGTH 512
>> +
>> +static int vfio_pci_init_dma_fault_region(struct vfio_pci_device *vdev)
>> +{
>> +struct vfio_region_dma_fault *header;
>> +size_t size;
>> +int ret;
>> +
>> +mutex_init(&vdev->fault_queue_lock);
>> +
>> +/*
>> + * We provision 1 page for the header and space for
>> + * DMA_FAULT_RING_LENGTH fault records in the ring buffer.
>> + */
>> +size = ALIGN(sizeof(struct iommu_fault) *
>> + DMA_FAULT_RING_LENGTH, PAGE_SIZE) + PAGE_SIZE;
>> +
>> +vdev->fault_pages = kzalloc(size, GFP_KERNEL);
>> +if (!vdev->fault_pages)
>> +return -ENOMEM;
>> +
>> +ret = vfio_pci_register_dev_region(vdev,
>> +VFIO_REGION_TYPE_NESTED,
>> +VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT,
>> +&vfio_pci_dma_fault_regops, size,
>> +VFIO_REGION_INFO_FLAG_READ |
>> VFIO_REGION_INFO_FLAG_WRITE,
>> +vdev->fault_pages);
>> +if (ret)
>> +goto out;
>> +
>> +header = (struct vfio_region_dma_fault *)vdev->fault_pages;
>> +header->entry_size = sizeof(struct iommu_fault);
>> +header->nb_entries = DMA_FAULT_RING_LENGTH;
>> +header->offset = sizeof(struct vfio_region_dma_fault);
>> +return 0;
>> +out:
>> +kfree(vdev->fault_pages);
>> +return ret;
>> +}
>> +
>>  static int vfio_pci_enable(struct vfio_pci_device *vdev)
>>  {
>>  struct pci_dev *pdev = vdev->pdev;
>> @@ -358,6 +421,10 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
>>  }
>>  }
>>
>> +ret = vfio_pci_init_dma_fault_region(vdev);
>> +if (ret)
>> +goto disable_exit;
>> +
>>  vfio_pci_probe_mmaps(vdev);
>>
>>  return 0;
>> @@ -1383,6 +1450,7 @@ static void vfio_pci_remove(struct pci_dev *pdev)
>>
>>  

RE: [PATCH v10 04/11] vfio/pci: Add VFIO_REGION_TYPE_NESTED region type

2020-04-01 Thread Liu, Yi L
Hi Eric,

Just curious about your plan for this patch; I just heard that my colleague
would like to reference the functions from this patch in his DSA driver work.

Regards,
Yi Liu

> From: Eric Auger 
> Sent: Saturday, March 21, 2020 12:19 AM
> To: eric.auger@gmail.com; eric.au...@redhat.com; iommu@lists.linux-
> foundation.org; linux-ker...@vger.kernel.org; k...@vger.kernel.org;
> kvm...@lists.cs.columbia.edu; j...@8bytes.org; alex.william...@redhat.com;
> jacob.jun@linux.intel.com; Liu, Yi L ; jean-
> philippe.bruc...@arm.com; will.dea...@arm.com; robin.mur...@arm.com
> Cc: marc.zyng...@arm.com; peter.mayd...@linaro.org; zhangfei@gmail.com
> Subject: [PATCH v10 04/11] vfio/pci: Add VFIO_REGION_TYPE_NESTED region type
> 
> Add a new specific DMA_FAULT region aiming to expose nested mode
> translation faults.
> 
> The region has a ring buffer that contains the actual fault
> records, plus a header that allows handling them (tail/head indices,
> max capacity, entry size). At the moment the region is dimensioned
> for 512 fault records.
> 
> Signed-off-by: Eric Auger 
> 
> ---
> 
> v8 -> v9:
> - Use a single region instead of a prod/cons region
> 
> v4 -> v5
> - check cons is not null in vfio_pci_check_cons_fault
> 
> v3 -> v4:
> - use 2 separate regions, respectively in read and write modes
> - add the version capability
> ---
>  drivers/vfio/pci/vfio_pci.c | 68 +
>  drivers/vfio/pci/vfio_pci_private.h | 10 +
>  drivers/vfio/pci/vfio_pci_rdwr.c| 45 +++
>  include/uapi/linux/vfio.h   | 35 +++
>  4 files changed, 158 insertions(+)
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 379a02c36e37..586b89debed5 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -260,6 +260,69 @@ int vfio_pci_set_power_state(struct vfio_pci_device 
> *vdev,
> pci_power_t state)
>   return ret;
>  }
> 
> +static void vfio_pci_dma_fault_release(struct vfio_pci_device *vdev,
> +struct vfio_pci_region *region)
> +{
> +}
> +
> +static int vfio_pci_dma_fault_add_capability(struct vfio_pci_device *vdev,
> +  struct vfio_pci_region *region,
> +  struct vfio_info_cap *caps)
> +{
> + struct vfio_region_info_cap_fault cap = {
> + .header.id = VFIO_REGION_INFO_CAP_DMA_FAULT,
> + .header.version = 1,
> + .version = 1,
> + };
> + return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
> +}
> +
> +static const struct vfio_pci_regops vfio_pci_dma_fault_regops = {
> + .rw = vfio_pci_dma_fault_rw,
> + .release= vfio_pci_dma_fault_release,
> + .add_capability = vfio_pci_dma_fault_add_capability,
> +};
> +
> +#define DMA_FAULT_RING_LENGTH 512
> +
> +static int vfio_pci_init_dma_fault_region(struct vfio_pci_device *vdev)
> +{
> + struct vfio_region_dma_fault *header;
> + size_t size;
> + int ret;
> +
> + mutex_init(&vdev->fault_queue_lock);
> +
> + /*
> +  * We provision 1 page for the header and space for
> +  * DMA_FAULT_RING_LENGTH fault records in the ring buffer.
> +  */
> + size = ALIGN(sizeof(struct iommu_fault) *
> +  DMA_FAULT_RING_LENGTH, PAGE_SIZE) + PAGE_SIZE;
> +
> + vdev->fault_pages = kzalloc(size, GFP_KERNEL);
> + if (!vdev->fault_pages)
> + return -ENOMEM;
> +
> + ret = vfio_pci_register_dev_region(vdev,
> + VFIO_REGION_TYPE_NESTED,
> + VFIO_REGION_SUBTYPE_NESTED_DMA_FAULT,
> + &vfio_pci_dma_fault_regops, size,
> + VFIO_REGION_INFO_FLAG_READ |
> VFIO_REGION_INFO_FLAG_WRITE,
> + vdev->fault_pages);
> + if (ret)
> + goto out;
> +
> + header = (struct vfio_region_dma_fault *)vdev->fault_pages;
> + header->entry_size = sizeof(struct iommu_fault);
> + header->nb_entries = DMA_FAULT_RING_LENGTH;
> + header->offset = sizeof(struct vfio_region_dma_fault);
> + return 0;
> +out:
> + kfree(vdev->fault_pages);
> + return ret;
> +}
> +
>  static int vfio_pci_enable(struct vfio_pci_device *vdev)
>  {
>   struct pci_dev *pdev = vdev->pdev;
> @@ -358,6 +421,10 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
>   }
>   }
> 
> + ret = vfio_pci_init_dma_fault_region(vdev);
> + if (ret)
> + goto disable_exit;
> +
>   vfio_pci_probe_mmaps(vdev);
> 
>   return 0;
> @@ -1383,6 +1450,7 @@ static void vfio_pci_remove(struct pci_dev *pdev)
> 
>   vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
>   kfree(vdev->region);
> + kfree(vdev->fault_pages);
>   mutex_destroy(>ioeventfds_lock);
> 
>   if (!disable_idle_d3)
> diff --git a/drivers/vfio/pci/vfio_pci_private.h 
> b/drivers/vfio/pci/vfio_pci_private.h
> index 8a2c7607d513..a392f50e3a99 100644
> --- 
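
(For context on how the region above would be consumed: one header page
followed by a ring of iommu_fault records implies a read loop roughly
like the userspace sketch below. The entry_size/nb_entries/offset fields
follow the header set up in the patch; the head/tail fields and the
update protocol are assumptions based on the commit message.)

  #include <stddef.h>
  #include <stdint.h>
  #include <linux/iommu.h>
  #include <linux/vfio.h>

  extern void handle_fault(const struct iommu_fault *fault);

  static void drain_dma_faults(struct vfio_region_dma_fault *header)
  {
          char *ring = (char *)header + header->offset;
          uint32_t tail = header->tail;

          while (tail != header->head) {
                  handle_fault((const struct iommu_fault *)
                               (ring + (size_t)tail * header->entry_size));
                  tail = (tail + 1) % header->nb_entries;
          }
          header->tail = tail;    /* publish consumption to the host */
  }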

RE: [PATCH v1 3/8] vfio/type1: Report PASID alloc/free support to userspace

2020-04-01 Thread Liu, Yi L
Hi Eric,

> From: Auger Eric 
> Sent: Wednesday, April 1, 2020 5:41 PM
> To: Liu, Yi L ; alex.william...@redhat.com
> Subject: Re: [PATCH v1 3/8] vfio/type1: Report PASID alloc/free support to
> userspace
> 
> Yi,
> On 3/22/20 1:32 PM, Liu, Yi L wrote:
> > From: Liu Yi L 
> >
> > This patch reports PASID alloc/free availability to userspace (e.g.
> > QEMU) so that userspace can do a pre-check before utilizing this feature.
> >
> > Cc: Kevin Tian 
> > CC: Jacob Pan 
> > Cc: Alex Williamson 
> > Cc: Eric Auger 
> > Cc: Jean-Philippe Brucker 
> > Signed-off-by: Liu Yi L 
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 28 
> >  include/uapi/linux/vfio.h   |  8 
> >  2 files changed, 36 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c index e40afc0..ddd1ffe 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -2234,6 +2234,30 @@ static int vfio_iommu_type1_pasid_free(struct
> vfio_iommu *iommu,
> > return ret;
> >  }
> >
> > +static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
> > +struct vfio_info_cap *caps)
> > +{
> > +   struct vfio_info_cap_header *header;
> > +   struct vfio_iommu_type1_info_cap_nesting *nesting_cap;
> > +
> > +   header = vfio_info_cap_add(caps, sizeof(*nesting_cap),
> > +  VFIO_IOMMU_TYPE1_INFO_CAP_NESTING, 1);
> > +   if (IS_ERR(header))
> > +   return PTR_ERR(header);
> > +
> > +   nesting_cap = container_of(header,
> > +   struct vfio_iommu_type1_info_cap_nesting,
> > +   header);
> > +
> > +   nesting_cap->nesting_capabilities = 0;
> > +   if (iommu->nesting) {
> > +   /* nesting iommu type supports PASID requests (alloc/free) */
> > +   nesting_cap->nesting_capabilities |= VFIO_IOMMU_PASID_REQS;
> Supporting nesting does not necessarily mean supporting PASID.

Here I think PASIDs are essentially IDs in the kernel that can be used
to tag various address spaces provided by guest software. I think this
is why we introduced the ioasid. :-) The current implementation does
PASID alloc/free in VFIO.

> > +   }
> > +
> > +   return 0;
> > +}
> > +
> >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> >unsigned int cmd, unsigned long arg)  { @@ -
> 2283,6 +2307,10 @@
> > static long vfio_iommu_type1_ioctl(void *iommu_data,
> > if (ret)
> > return ret;
> >
> > +   ret = vfio_iommu_info_add_nesting_cap(iommu, &caps);
> > +   if (ret)
> > +   return ret;
> > +
> > if (caps.size) {
> > info.flags |= VFIO_IOMMU_INFO_CAPS;
> >
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 298ac80..8837219 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -748,6 +748,14 @@ struct vfio_iommu_type1_info_cap_iova_range {
> > struct  vfio_iova_range iova_ranges[];
> >  };
> >
> > +#define VFIO_IOMMU_TYPE1_INFO_CAP_NESTING  2
> > +
> > +struct vfio_iommu_type1_info_cap_nesting {
> > +   struct  vfio_info_cap_header header;
> > +#define VFIO_IOMMU_PASID_REQS  (1 << 0)
> PASID_REQS sounds a bit far from the claimed host managed alloc/free
> capability.
> VFIO_IOMMU_SYSTEM_WIDE_PASID?

Oh, yep. I can rename it.

Regards,
Yi Liu


Re: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to userspace

2020-04-01 Thread Auger Eric
Hi Yi,

On 4/1/20 2:51 PM, Liu, Yi L wrote:
> Hi Eric,
> 
>> From: Auger Eric 
>> Sent: Wednesday, April 1, 2020 4:51 PM
>> To: Liu, Yi L ; alex.william...@redhat.com
>> Subject: Re: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to
>> userspace
>>
>> Hi Yi,
>> On 3/22/20 1:32 PM, Liu, Yi L wrote:
>>> From: Liu Yi L 
>>>
>>> VFIO exposes the IOMMU nesting translation (a.k.a. dual stage
>>> translation) capability to userspace. Thus applications like QEMU
>>> could support vIOMMU with hardware's nesting translation capability
>>> for pass-through devices. Before setting up nesting translation for
>>> pass-through devices, QEMU and other applications need to learn the
>>> supported 1st-lvl/stage-1 translation structure format, such as the
>>> page table format.
>>>
>>> Take vSVA (virtual Shared Virtual Addressing) as an example: to
>>> support vSVA for pass-through devices, QEMU sets up nesting
>>> translation for pass-through devices. The guest page tables are
>>> configured in the host as the 1st-lvl/stage-1 page table. Therefore,
>>> the guest format should be compatible with the host side.
>>>
>>> This patch reports the supported 1st-lvl/stage-1 page table format on
>>> the current platform to userspace. QEMU and other similar
>>> applications should use this format info when trying to set up IOMMU
>>> nesting translation on the host IOMMU.
>>>
>>> Cc: Kevin Tian 
>>> CC: Jacob Pan 
>>> Cc: Alex Williamson 
>>> Cc: Eric Auger 
>>> Cc: Jean-Philippe Brucker 
>>> Signed-off-by: Liu Yi L 
>>> ---
>>>  drivers/vfio/vfio_iommu_type1.c | 56
>> +
>>>  include/uapi/linux/vfio.h   |  1 +
>>>  2 files changed, 57 insertions(+)
>>>
>>> diff --git a/drivers/vfio/vfio_iommu_type1.c 
>>> b/drivers/vfio/vfio_iommu_type1.c
>>> index 9aa2a67..82a9e0b 100644
>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>> @@ -2234,11 +2234,66 @@ static int vfio_iommu_type1_pasid_free(struct
>> vfio_iommu *iommu,
>>> return ret;
>>>  }
>>>
>>> +static int vfio_iommu_get_stage1_format(struct vfio_iommu *iommu,
>>> +u32 *stage1_format)
>> vfio_pasid_format() to be homogeneous with vfio_pgsize_bitmap() which
>> does the same kind of enumeration of the vfio_iommu domains
> 
> yes, similar.
> 
>>> +{
>>> +   struct vfio_domain *domain;
>>> +   u32 format = 0, tmp_format = 0;
>>> +   int ret;
>> ret = -EINVAL;
> 
> got it.
> 
>>> +
>>> +   mutex_lock(&iommu->lock);
>>> +   if (list_empty(&iommu->domain_list)) {
>> goto out_unlock;
> 
> right.
>>> +   mutex_unlock(&iommu->lock);
>>> +   return -EINVAL;
>>> +   }
>>> +
>>> +   list_for_each_entry(domain, &iommu->domain_list, next) {
>>> +   if (iommu_domain_get_attr(domain->domain,
>>> +   DOMAIN_ATTR_PASID_FORMAT, &format)) {
>> I can find DOMAIN_ATTR_PASID_FORMAT in Jacob's v9 but not in v10
> 
> oops, I guess he somehow missed. you may find it in below link.
> 
> https://github.com/luxis1999/linux-vsva/commit/bf14b11a12f74d58ad3ee626a5d891de395082eb
> 
>>> +   ret = -EINVAL;
>> could be removed
> 
> sure.
> 
>>> +   format = 0;
>>> +   goto out_unlock;
>>> +   }
>>> +   /*
>>> +* format is always non-zero (the first format is
>>> +* IOMMU_PASID_FORMAT_INTEL_VTD which is 1). For
>>> +* the reason of potential different backed IOMMU
>>> +* formats, here we expect to have identical formats
>>> +* in the domain list, no mixed formats support.
>>> +* return -EINVAL to fail the attempt of setup
>>> +* VFIO_TYPE1_NESTING_IOMMU if non-identical formats
>>> +* are detected.
>>> +*/
>>> +   if (tmp_format && tmp_format != format) {
>>> +   ret = -EINVAL;
>> could be removed
> 
> got it.
> 
>>> +   format = 0;
>>> +   goto out_unlock;
>>> +   }
>>> +
>>> +   tmp_format = format;
>>> +   }
>>> +   ret = 0;
>>> +
>>> +out_unlock:
>>> +   if (format)
>> if (!ret) ? then you can remove the format = 0 in case of error.
> 
> oh, yes.
> 
>>> +   *stage1_format = format;
>>> +   mutex_unlock(&iommu->lock);
>>> +   return ret;
>>> +}
>>> +
>>>  static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
>>>  struct vfio_info_cap *caps)
>>>  {
>>> struct vfio_info_cap_header *header;
>>> struct vfio_iommu_type1_info_cap_nesting *nesting_cap;
>>> +   u32 formats = 0;
>>> +   int ret;
>>> +
>>> +   ret = vfio_iommu_get_stage1_format(iommu, &formats);
>>> +   if (ret) {
>>> +   pr_warn("Failed to get stage-1 format\n");
>> trace triggered by userspace to be removed?
> 
> sure.
> 
>>> +   return ret;
>>> +   }
>>>
>>> header = vfio_info_cap_add(caps, sizeof(*nesting_cap),
>>>VFIO_IOMMU_TYPE1_INFO_CAP_NESTING, 1);
>>> @@ -2254,6 +2309,7 @@ static int 

Re: [RFC PATCH v2] iommu/virtio: Use page size bitmap supported by endpoint

2020-04-01 Thread Robin Murphy

On 2020-04-01 12:38 pm, Bharat Bhushan wrote:

Different endpoints can support different page sizes; probe the
endpoint to see whether it supports a specific page size, otherwise use
the global page sizes.

Signed-off-by: Bharat Bhushan 
---
  drivers/iommu/virtio-iommu.c  | 33 +++
  include/uapi/linux/virtio_iommu.h |  7 +++
  2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index cce329d71fba..c794cb5b7b3e 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -78,6 +78,7 @@ struct viommu_endpoint {
struct viommu_dev   *viommu;
struct viommu_domain*vdomain;
struct list_headresv_regions;
+   u64 pgsize_bitmap;
  };
  
  struct viommu_request {

@@ -415,6 +416,20 @@ static int viommu_replay_mappings(struct viommu_domain 
*vdomain)
return ret;
  }
  
+static int viommu_set_pgsize_bitmap(struct viommu_endpoint *vdev,

+   struct virtio_iommu_probe_pgsize_mask *mask,
+   size_t len)
+
+{
+   u64 pgsize_bitmap = le64_to_cpu(mask->pgsize_bitmap);
+
+   if (len < sizeof(*mask))
+   return -EINVAL;
+
+   vdev->pgsize_bitmap = pgsize_bitmap;
+   return 0;
+}
+
  static int viommu_add_resv_mem(struct viommu_endpoint *vdev,
   struct virtio_iommu_probe_resv_mem *mem,
   size_t len)
@@ -494,11 +509,13 @@ static int viommu_probe_endpoint(struct viommu_dev 
*viommu, struct device *dev)
while (type != VIRTIO_IOMMU_PROBE_T_NONE &&
   cur < viommu->probe_size) {
len = le16_to_cpu(prop->length) + sizeof(*prop);
-
switch (type) {
case VIRTIO_IOMMU_PROBE_T_RESV_MEM:
ret = viommu_add_resv_mem(vdev, (void *)prop, len);
break;
+   case VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK:
+   ret = viommu_set_pgsize_bitmap(vdev, (void *)prop, len);
+   break;
default:
dev_err(dev, "unknown viommu prop 0x%x\n", type);
}
@@ -607,16 +624,23 @@ static struct iommu_domain *viommu_domain_alloc(unsigned 
type)
	return &vdomain->domain;
  }
  
-static int viommu_domain_finalise(struct viommu_dev *viommu,

+static int viommu_domain_finalise(struct viommu_endpoint *vdev,
  struct iommu_domain *domain)
  {
int ret;
struct viommu_domain *vdomain = to_viommu_domain(domain);
+   struct viommu_dev *viommu = vdev->viommu;
  
  	vdomain->viommu		= viommu;

vdomain->map_flags   = viommu->map_flags;
  
-	domain->pgsize_bitmap	= viommu->pgsize_bitmap;

+   /* Devices in same domain must support same size pages */


AFAICS what the code appears to do is enforce that the first endpoint 
attached to any domain has the same pgsize_bitmap as the most recently 
probed viommu_dev instance, then ignore any subsequent endpoints 
attached to the same domain. Thus I'm not sure that comment is accurate.


Robin.


+   if ((domain->pgsize_bitmap != viommu->pgsize_bitmap) &&
+   (domain->pgsize_bitmap != vdev->pgsize_bitmap))
+   return -EINVAL;
+
+   domain->pgsize_bitmap = vdev->pgsize_bitmap;
+
domain->geometry = viommu->geometry;
  
	ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain,

@@ -657,7 +681,7 @@ static int viommu_attach_dev(struct iommu_domain *domain, 
struct device *dev)
 * Properly initialize the domain now that we know which viommu
 * owns it.
 */
-   ret = viommu_domain_finalise(vdev->viommu, domain);
+   ret = viommu_domain_finalise(vdev, domain);
} else if (vdomain->viommu != vdev->viommu) {
dev_err(dev, "cannot attach to foreign vIOMMU\n");
ret = -EXDEV;
@@ -875,6 +899,7 @@ static int viommu_add_device(struct device *dev)
  
  	vdev->dev = dev;

vdev->viommu = viommu;
+   vdev->pgsize_bitmap = viommu->pgsize_bitmap;
+   INIT_LIST_HEAD(&vdev->resv_regions);
fwspec->iommu_priv = vdev;
  
diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h

index 237e36a280cb..dc9d3f40bcd8 100644
--- a/include/uapi/linux/virtio_iommu.h
+++ b/include/uapi/linux/virtio_iommu.h
@@ -111,6 +111,7 @@ struct virtio_iommu_req_unmap {
  
  #define VIRTIO_IOMMU_PROBE_T_NONE		0

  #define VIRTIO_IOMMU_PROBE_T_RESV_MEM 1
+#define VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK2
  
  #define VIRTIO_IOMMU_PROBE_T_MASK		0xfff
  
@@ -119,6 +120,12 @@ struct virtio_iommu_probe_property {

__le16  length;
  };
  
+struct virtio_iommu_probe_pgsize_mask {

+   struct virtio_iommu_probe_property  head;
+

RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to userspace

2020-04-01 Thread Liu, Yi L
Hi Eric,

> From: Auger Eric 
> Sent: Wednesday, April 1, 2020 4:51 PM
> To: Liu, Yi L ; alex.william...@redhat.com
> Subject: Re: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to
> userspace
> 
> Hi Yi,
> On 3/22/20 1:32 PM, Liu, Yi L wrote:
> > From: Liu Yi L 
> >
> > VFIO exposes the IOMMU nesting translation (a.k.a. dual stage
> > translation) capability to userspace. Thus applications like QEMU
> > could support vIOMMU with hardware's nesting translation capability
> > for pass-through devices. Before setting up nesting translation for
> > pass-through devices, QEMU and other applications need to learn the
> > supported 1st-lvl/stage-1 translation structure format, such as the
> > page table format.
> >
> > Take vSVA (virtual Shared Virtual Addressing) as an example: to
> > support vSVA for pass-through devices, QEMU sets up nesting
> > translation for pass-through devices. The guest page tables are
> > configured in the host as the 1st-lvl/stage-1 page table. Therefore,
> > the guest format should be compatible with the host side.
> >
> > This patch reports the supported 1st-lvl/stage-1 page table format on
> > the current platform to userspace. QEMU and other similar
> > applications should use this format info when trying to set up IOMMU
> > nesting translation on the host IOMMU.
> >
> > Cc: Kevin Tian 
> > CC: Jacob Pan 
> > Cc: Alex Williamson 
> > Cc: Eric Auger 
> > Cc: Jean-Philippe Brucker 
> > Signed-off-by: Liu Yi L 
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 56
> +
> >  include/uapi/linux/vfio.h   |  1 +
> >  2 files changed, 57 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c 
> > b/drivers/vfio/vfio_iommu_type1.c
> > index 9aa2a67..82a9e0b 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -2234,11 +2234,66 @@ static int vfio_iommu_type1_pasid_free(struct
> vfio_iommu *iommu,
> > return ret;
> >  }
> >
> > +static int vfio_iommu_get_stage1_format(struct vfio_iommu *iommu,
> > +u32 *stage1_format)
> vfio_pasid_format() to be homogeneous with vfio_pgsize_bitmap() which
> does the same kind of enumeration of the vfio_iommu domains

yes, similar.

> > +{
> > +   struct vfio_domain *domain;
> > +   u32 format = 0, tmp_format = 0;
> > +   int ret;
> ret = -EINVAL;

got it.

> > +
> > +   mutex_lock(&iommu->lock);
> > +   if (list_empty(&iommu->domain_list)) {
> goto out_unlock;

right.
> > +   mutex_unlock(&iommu->lock);
> > +   return -EINVAL;
> > +   }
> > +
> > +   list_for_each_entry(domain, &iommu->domain_list, next) {
> > +   if (iommu_domain_get_attr(domain->domain,
> > +   DOMAIN_ATTR_PASID_FORMAT, &format)) {
> I can find DOMAIN_ATTR_PASID_FORMAT in Jacob's v9 but not in v10

oops, I guess he somehow missed. you may find it in below link.

https://github.com/luxis1999/linux-vsva/commit/bf14b11a12f74d58ad3ee626a5d891de395082eb

> > +   ret = -EINVAL;
> could be removed

sure.

> > +   format = 0;
> > +   goto out_unlock;
> > +   }
> > +   /*
> > +* format is always non-zero (the first format is
> > +* IOMMU_PASID_FORMAT_INTEL_VTD which is 1). For
> > +* the reason of potential different backed IOMMU
> > +* formats, here we expect to have identical formats
> > +* in the domain list, no mixed formats support.
> > +* return -EINVAL to fail the attempt of setup
> > +* VFIO_TYPE1_NESTING_IOMMU if non-identical formats
> > +* are detected.
> > +*/
> > +   if (tmp_format && tmp_format != format) {
> > +   ret = -EINVAL;
> could be removed

got it.

> > +   format = 0;
> > +   goto out_unlock;
> > +   }
> > +
> > +   tmp_format = format;
> > +   }
> > +   ret = 0;
> > +
> > +out_unlock:
> > +   if (format)
> if (!ret) ? then you can remove the format = 0 in case of error.

oh, yes.

> > +   *stage1_format = format;
> > +   mutex_unlock(&iommu->lock);
> > +   return ret;
> > +}
> > +
> >  static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
> >  struct vfio_info_cap *caps)
> >  {
> > struct vfio_info_cap_header *header;
> > struct vfio_iommu_type1_info_cap_nesting *nesting_cap;
> > +   u32 formats = 0;
> > +   int ret;
> > +
> > +   ret = vfio_iommu_get_stage1_format(iommu, &formats);
> > +   if (ret) {
> > +   pr_warn("Failed to get stage-1 format\n");
> trace triggered by userspace to be removed?

sure.

> > +   return ret;
> > +   }
> >
> > header = vfio_info_cap_add(caps, sizeof(*nesting_cap),
> >VFIO_IOMMU_TYPE1_INFO_CAP_NESTING, 1);
> > @@ -2254,6 +2309,7 @@ static int vfio_iommu_info_add_nesting_cap(struct
> vfio_iommu *iommu,
> > /* nesting iommu type supports PASID requests (alloc/free) */
> >   

Re: [PATCH v2 2/4] iommu: Add Allwinner H6 IOMMU driver

2020-04-01 Thread Maxime Ripard
Hi Jörg,

Thanks for your review. I'll fix the issues you pointed out, as well as
the ones I've left out of this reply.

On Mon, Mar 02, 2020 at 04:36:06PM +0100, Joerg Roedel wrote:
> On Thu, Feb 20, 2020 at 07:15:14PM +0100, Maxime Ripard wrote:
> > +struct sun50i_iommu_domain {
> > +   struct iommu_domain domain;
> > +
> > +   /* Number of devices attached to the domain */
> > +   refcount_t refcnt;
> > +
> > +   /* Lock to modify the Directory Table */
> > +   spinlock_t dt_lock;
>
> I suggest you make page-table updates lock-less. Otherwise this lock
> will become a bottle-neck when using the IOMMU through DMA-API.

As far as I understand it, the page table can be accessed concurrently,
since the framework doesn't seem to provide any serialization / locking.
Shouldn't we have some locks to prevent concurrent access?
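
(For reference, the lock-less scheme usually relies on map() callers
owning their IOVA ranges, so only the "is this PTE already valid?" check
needs to be atomic. A sketch reusing the helpers quoted below; untested
on this hardware:)

  static int sun50i_install_pte(u32 *pte_addr, phys_addr_t paddr, int prot)
  {
          u32 old, new = sun50i_mk_pte(paddr, prot);

          /* Atomically install the PTE only if the slot was empty */
          old = cmpxchg(pte_addr, 0, new);
          if (old != 0)
                  return -EBUSY;  /* already mapped, or lost a race */

          return 0;
  }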

> > +
> > +static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long 
> > iova,
> > +   phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
> > +{
> > +   struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
> > +   struct sun50i_iommu *iommu = sun50i_domain->iommu;
> > +   u32 pte_index;
> > +   u32 *page_table, *pte_addr;
> > +   unsigned long flags;
> > +   int ret = 0;
> > +
> > +   spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
> > +   page_table = sun50i_dte_get_page_table(sun50i_domain, iova, gfp);
> > +   if (IS_ERR(page_table)) {
> > +   ret = PTR_ERR(page_table);
> > +   goto out;
> > +   }
> > +
> > +   pte_index = sun50i_iova_get_pte_index(iova);
> > +   pte_addr = &page_table[pte_index];
> > +   if (sun50i_pte_is_page_valid(*pte_addr)) {
>
> You can use unlikely() here.
>
> > +   phys_addr_t page_phys = sun50i_pte_get_page_address(*pte_addr);
> > +   dev_err(iommu->dev,
> > +   "iova %pad already mapped to %pa cannot remap to %pa 
> > prot: %#x\n",
> > +   , _phys, , prot);
> > +   ret = -EBUSY;
> > +   goto out;
> > +   }
> > +
> > +   *pte_addr = sun50i_mk_pte(paddr, prot);
> > +   sun50i_table_flush(sun50i_domain, pte_addr, 1);
>
> This maps only one page, right? But the function needs to map up to
> 'size' as given in the parameter list.

It does, but pgsize_bitmap is set to 4k only (since the hardware only
supports that), so we would have multiple calls to map, each time with
a single page judging from:
https://elixir.bootlin.com/linux/latest/source/drivers/iommu/iommu.c#L1948

Right?
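
(Yes - simplified, the core splitting loop looks roughly like this; a
sketch of __iommu_map() in drivers/iommu/iommu.c, not verbatim:)

  static int example_iommu_map(struct iommu_domain *domain,
                               unsigned long iova, phys_addr_t paddr,
                               size_t size, int prot, gfp_t gfp)
  {
          const struct iommu_ops *ops = domain->ops;
          int ret = 0;

          while (size) {
                  /* with a 4k-only pgsize_bitmap this is always SZ_4K */
                  size_t pgsize = iommu_pgsize(domain, iova | paddr, size);

                  ret = ops->map(domain, iova, paddr, pgsize, prot, gfp);
                  if (ret)
                          break;

                  iova += pgsize;
                  paddr += pgsize;
                  size -= pgsize;
          }
          return ret;
  }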

> > +
> > +   spin_lock_irqsave(&iommu->iommu_lock, flags);
> > +   sun50i_iommu_tlb_invalidate(iommu, iova);
> > +   spin_unlock_irqrestore(&iommu->iommu_lock, flags);
>
> Why is there a need to flush the TLB here? The IOMMU-API provides
> call-backs so that the user of the API can decide when it wants
> to flush the IO/TLB. Such flushes are usually expensive and doing them
> on every map and unmap will cost significant performance.

The vendor driver was doing something along those lines, and I wanted to
be conservative with the cache management as long as we didn't run into
performance issues, but I'll convert to the iotlb callbacks then.

Thanks!
Maxime



[RFC PATCH v2] iommu/virtio: Use page size bitmap supported by endpoint

2020-04-01 Thread Bharat Bhushan
Different endpoints can support different page sizes; probe the
endpoint to see whether it supports a specific page size, otherwise use
the global page sizes.

Signed-off-by: Bharat Bhushan 
---
 drivers/iommu/virtio-iommu.c  | 33 +++
 include/uapi/linux/virtio_iommu.h |  7 +++
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index cce329d71fba..c794cb5b7b3e 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -78,6 +78,7 @@ struct viommu_endpoint {
struct viommu_dev   *viommu;
struct viommu_domain*vdomain;
struct list_headresv_regions;
+   u64 pgsize_bitmap;
 };
 
 struct viommu_request {
@@ -415,6 +416,20 @@ static int viommu_replay_mappings(struct viommu_domain 
*vdomain)
return ret;
 }
 
+static int viommu_set_pgsize_bitmap(struct viommu_endpoint *vdev,
+   struct virtio_iommu_probe_pgsize_mask *mask,
+   size_t len)
+
+{
+   u64 pgsize_bitmap = le64_to_cpu(mask->pgsize_bitmap);
+
+   if (len < sizeof(*mask))
+   return -EINVAL;
+
+   vdev->pgsize_bitmap = pgsize_bitmap;
+   return 0;
+}
+
 static int viommu_add_resv_mem(struct viommu_endpoint *vdev,
   struct virtio_iommu_probe_resv_mem *mem,
   size_t len)
@@ -494,11 +509,13 @@ static int viommu_probe_endpoint(struct viommu_dev 
*viommu, struct device *dev)
while (type != VIRTIO_IOMMU_PROBE_T_NONE &&
   cur < viommu->probe_size) {
len = le16_to_cpu(prop->length) + sizeof(*prop);
-
switch (type) {
case VIRTIO_IOMMU_PROBE_T_RESV_MEM:
ret = viommu_add_resv_mem(vdev, (void *)prop, len);
break;
+   case VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK:
+   ret = viommu_set_pgsize_bitmap(vdev, (void *)prop, len);
+   break;
default:
dev_err(dev, "unknown viommu prop 0x%x\n", type);
}
@@ -607,16 +624,23 @@ static struct iommu_domain *viommu_domain_alloc(unsigned 
type)
	return &vdomain->domain;
 }
 
-static int viommu_domain_finalise(struct viommu_dev *viommu,
+static int viommu_domain_finalise(struct viommu_endpoint *vdev,
  struct iommu_domain *domain)
 {
int ret;
struct viommu_domain *vdomain = to_viommu_domain(domain);
+   struct viommu_dev *viommu = vdev->viommu;
 
vdomain->viommu = viommu;
vdomain->map_flags  = viommu->map_flags;
 
-   domain->pgsize_bitmap   = viommu->pgsize_bitmap;
+   /* Devices in same domain must support same size pages */
+   if ((domain->pgsize_bitmap != viommu->pgsize_bitmap) &&
+   (domain->pgsize_bitmap != vdev->pgsize_bitmap))
+   return -EINVAL;
+
+   domain->pgsize_bitmap = vdev->pgsize_bitmap;
+
domain->geometry= viommu->geometry;
 
	ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain,
@@ -657,7 +681,7 @@ static int viommu_attach_dev(struct iommu_domain *domain, 
struct device *dev)
 * Properly initialize the domain now that we know which viommu
 * owns it.
 */
-   ret = viommu_domain_finalise(vdev->viommu, domain);
+   ret = viommu_domain_finalise(vdev, domain);
} else if (vdomain->viommu != vdev->viommu) {
dev_err(dev, "cannot attach to foreign vIOMMU\n");
ret = -EXDEV;
@@ -875,6 +899,7 @@ static int viommu_add_device(struct device *dev)
 
vdev->dev = dev;
vdev->viommu = viommu;
+   vdev->pgsize_bitmap = viommu->pgsize_bitmap;
+   INIT_LIST_HEAD(&vdev->resv_regions);
fwspec->iommu_priv = vdev;
 
diff --git a/include/uapi/linux/virtio_iommu.h 
b/include/uapi/linux/virtio_iommu.h
index 237e36a280cb..dc9d3f40bcd8 100644
--- a/include/uapi/linux/virtio_iommu.h
+++ b/include/uapi/linux/virtio_iommu.h
@@ -111,6 +111,7 @@ struct virtio_iommu_req_unmap {
 
 #define VIRTIO_IOMMU_PROBE_T_NONE  0
 #define VIRTIO_IOMMU_PROBE_T_RESV_MEM  1
+#define VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK2
 
 #define VIRTIO_IOMMU_PROBE_T_MASK  0xfff
 
@@ -119,6 +120,12 @@ struct virtio_iommu_probe_property {
__le16  length;
 };
 
+struct virtio_iommu_probe_pgsize_mask {
+   struct virtio_iommu_probe_property  head;
+   __u8reserved[4];
+   __u64   pgsize_bitmap;
+};
+
 #define VIRTIO_IOMMU_RESV_MEM_T_RESERVED   0
 #define VIRTIO_IOMMU_RESV_MEM_T_MSI1
 
-- 
2.17.1
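
(Regarding Robin's review earlier in this digest: a check that validates
every endpoint against the page sizes the domain was actually finalised
with might look more like the sketch below; illustrative only, not part
of the patch.)

  static int viommu_check_pgsizes(struct iommu_domain *domain,
                                  struct viommu_endpoint *vdev)
  {
          /* The endpoint must support every page size the domain uses */
          if ((vdev->pgsize_bitmap & domain->pgsize_bitmap) !=
              domain->pgsize_bitmap)
                  return -EINVAL;

          return 0;
  }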


[PATCH] vfio: Ignore -ENODEV when getting MSI cookie

2020-04-01 Thread Andre Przywara
When we try to get an MSI cookie for a VFIO device, that can fail if
CONFIG_IOMMU_DMA is not set. In this case iommu_get_msi_cookie() returns
-ENODEV, and that should not be fatal.

Ignore that case and proceed with the initialisation.

This fixes VFIO with a platform device on the Calxeda Midway (no MSIs).

Fixes: f6810c15cf973f ("iommu/arm-smmu: Clean up early-probing workarounds")
Signed-off-by: Andre Przywara 
Acked-by: Robin Murphy 
Reviewed-by: Eric Auger 
---
 drivers/vfio/vfio_iommu_type1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 9fdfae1cb17a..85b32c325282 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1787,7 +1787,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 
if (resv_msi) {
ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
-   if (ret)
+   if (ret && ret != -ENODEV)
goto out_detach;
}
 
-- 
2.17.1



Re: [PATCH v1 3/8] vfio/type1: Report PASID alloc/free support to userspace

2020-04-01 Thread Auger Eric
Yi,
On 3/22/20 1:32 PM, Liu, Yi L wrote:
> From: Liu Yi L 
> 
> This patch reports PASID alloc/free availability to userspace (e.g. QEMU)
> so that userspace can do a pre-check before utilizing this feature.
> 
> Cc: Kevin Tian 
> CC: Jacob Pan 
> Cc: Alex Williamson 
> Cc: Eric Auger 
> Cc: Jean-Philippe Brucker 
> Signed-off-by: Liu Yi L 
> ---
>  drivers/vfio/vfio_iommu_type1.c | 28 
>  include/uapi/linux/vfio.h   |  8 
>  2 files changed, 36 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index e40afc0..ddd1ffe 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -2234,6 +2234,30 @@ static int vfio_iommu_type1_pasid_free(struct 
> vfio_iommu *iommu,
>   return ret;
>  }
>  
> +static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
> +  struct vfio_info_cap *caps)
> +{
> + struct vfio_info_cap_header *header;
> + struct vfio_iommu_type1_info_cap_nesting *nesting_cap;
> +
> + header = vfio_info_cap_add(caps, sizeof(*nesting_cap),
> +VFIO_IOMMU_TYPE1_INFO_CAP_NESTING, 1);
> + if (IS_ERR(header))
> + return PTR_ERR(header);
> +
> + nesting_cap = container_of(header,
> + struct vfio_iommu_type1_info_cap_nesting,
> + header);
> +
> + nesting_cap->nesting_capabilities = 0;
> + if (iommu->nesting) {
> + /* nesting iommu type supports PASID requests (alloc/free) */
> + nesting_cap->nesting_capabilities |= VFIO_IOMMU_PASID_REQS;
Supporting nesting does not necessarily mean supporting PASID.
> + }
> +
> + return 0;
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  unsigned int cmd, unsigned long arg)
>  {
> @@ -2283,6 +2307,10 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>   if (ret)
>   return ret;
>  
> + ret = vfio_iommu_info_add_nesting_cap(iommu, &caps);
> + if (ret)
> + return ret;
> +
>   if (caps.size) {
>   info.flags |= VFIO_IOMMU_INFO_CAPS;
>  
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 298ac80..8837219 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -748,6 +748,14 @@ struct vfio_iommu_type1_info_cap_iova_range {
>   struct  vfio_iova_range iova_ranges[];
>  };
>  
> +#define VFIO_IOMMU_TYPE1_INFO_CAP_NESTING  2
> +
> +struct vfio_iommu_type1_info_cap_nesting {
> + struct  vfio_info_cap_header header;
> +#define VFIO_IOMMU_PASID_REQS(1 << 0)
PASID_REQS sounds a bit far from the claimed host managed alloc/free
capability.
VFIO_IOMMU_SYSTEM_WIDE_PASID?
> + __u32   nesting_capabilities;
> +};
> +
>  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>  
>  /**
> 
Thanks

Eric



RE: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host

2020-04-01 Thread Liu, Yi L
> From: Tian, Kevin 
> Sent: Monday, March 30, 2020 8:46 PM
> Subject: RE: [PATCH v1 6/8] vfio/type1: Bind guest page tables to host
> 
> > From: Liu, Yi L 
> > Sent: Sunday, March 22, 2020 8:32 PM
> >
> > From: Liu Yi L 
> >
> > VFIO_TYPE1_NESTING_IOMMU is an IOMMU type which is backed by hardware
> > IOMMUs that have nesting DMA translation (a.k.a. dual stage address
> > translation). For such hardware IOMMUs, there are two stages/levels of
> > address translation, and software may let userspace/VM own the
> > first-level/stage-1 translation structures. An example of such usage is
> > vSVA (virtual Shared Virtual Addressing). The VM owns the
> > first-level/stage-1 translation structures and binds the structures to
> > the host; the hardware IOMMU would then utilize nesting translation
> > when doing DMA translation for the devices behind such hardware IOMMU.
> >
> > This patch adds VFIO support for binding guest translation (a.k.a.
> > stage 1) structures to the host IOMMU. For VFIO_TYPE1_NESTING_IOMMU,
> > not only is binding the guest page table needed; an interface must
> > also be exposed to the guest for IOMMU cache invalidation when the
> > guest modifies the first-level/stage-1 translation structures, since
> > hardware needs to be notified to flush stale iotlbs. This is
> > introduced in the next patch.
> >
> > In this patch, guest page table bind and unbind are done by using the
> > flags VFIO_IOMMU_BIND_GUEST_PGTBL and VFIO_IOMMU_UNBIND_GUEST_PGTBL
> > under the VFIO_IOMMU_BIND ioctl; the bind/unbind data are conveyed by
> > struct iommu_gpasid_bind_data. Before binding a guest page table to
> > the host, the VM should have a PASID allocated by the host via
> > VFIO_IOMMU_PASID_REQUEST.
> >
> > Bind guest translation structures (here is guest page table) to host
> 
> Bind -> Binding
got it.
> > are the first step to setup vSVA (Virtual Shared Virtual Addressing).
> 
> are -> is. and you already explained vSVA earlier.
oh yes, it is.
> >
> > Cc: Kevin Tian 
> > CC: Jacob Pan 
> > Cc: Alex Williamson 
> > Cc: Eric Auger 
> > Cc: Jean-Philippe Brucker 
> > Signed-off-by: Jean-Philippe Brucker 
> > Signed-off-by: Liu Yi L 
> > Signed-off-by: Jacob Pan 
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 158
> > 
> >  include/uapi/linux/vfio.h   |  46 
> >  2 files changed, 204 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c index 82a9e0b..a877747 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -130,6 +130,33 @@ struct vfio_regions {
> >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)\
> > (!list_empty(&iommu->domain_list))
> >
> > +struct domain_capsule {
> > +   struct iommu_domain *domain;
> > +   void *data;
> > +};
> > +
> > +/* iommu->lock must be held */
> > +static int vfio_iommu_for_each_dev(struct vfio_iommu *iommu,
> > + int (*fn)(struct device *dev, void *data),
> > + void *data)
> > +{
> > +   struct domain_capsule dc = {.data = data};
> > +   struct vfio_domain *d;
> > +   struct vfio_group *g;
> > +   int ret = 0;
> > +
> > +   list_for_each_entry(d, &iommu->domain_list, next) {
> > +   dc.domain = d->domain;
> > +   list_for_each_entry(g, >group_list, next) {
> > +   ret = iommu_group_for_each_dev(g->iommu_group,
> > +  &dc, fn);
> > +   if (ret)
> > +   break;
> > +   }
> > +   }
> > +   return ret;
> > +}
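
[A minimal illustration of how a callback plugs into this helper; the
dump_dev_fn below is hypothetical, not part of the patch:]

static int dump_dev_fn(struct device *dev, void *data)
{
	struct domain_capsule *dc = data;

	/* dc->domain is the iommu_domain being walked; dc->data carries
	 * the payload handed to vfio_iommu_for_each_dev() */
	dev_info(dev, "visited under domain %p\n", dc->domain);
	return 0;
}

	/* with iommu->lock held: */
	ret = vfio_iommu_for_each_dev(iommu, dump_dev_fn, NULL);
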
> > +
> >  static int put_pfn(unsigned long pfn, int prot);
> >
> >  /*
> > @@ -2314,6 +2341,88 @@ static int
> > vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
> > return 0;
> >  }
> >
> > +static int vfio_bind_gpasid_fn(struct device *dev, void *data)
> > +{
> > +   struct domain_capsule *dc = (struct domain_capsule *)data;
> > +   struct iommu_gpasid_bind_data *gbind_data =
> > +   (struct iommu_gpasid_bind_data *) dc->data;
> > +
> 
> In Jacob's vSVA iommu series, [PATCH 06/11]:
> 
> + /* REVISIT: upper layer/VFIO can track host process that bind the PASID.
> +  * ioasid_set = mm might be sufficient for vfio to check pasid VMM
> +  * ownership.
> +  */
> 
> I asked him who exactly should be responsible for tracking the pasid
> ownership. Although there is no response yet, I expect vfio/iommu to have
> a clear policy, also documented here, to provide a consistent message.

yep.

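[For illustration, the kind of ownership check being discussed might look
roughly like this; vfio_find_ioasid_set() is a hypothetical lookup keyed
by the VMM's mm, matching the "ioasid_set = mm" idea in the REVISIT
comment above:]

static bool vfio_pasid_owned_by_current(ioasid_t pasid)
{
	struct ioasid_set *set = vfio_find_ioasid_set(current->mm);

	/* ioasid_find() returns an ERR_PTR if the PASID does not belong
	 * to the given set */
	return set && !IS_ERR(ioasid_find(set, pasid, NULL));
}
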
> > +   return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data);
> > +}
> > +
> > +static int vfio_unbind_gpasid_fn(struct device *dev, void *data)
> > +{
> > +   struct domain_capsule *dc = (struct domain_capsule *)data;
> > +   struct iommu_gpasid_bind_data *gbind_data =
> > +   (struct iommu_gpasid_bind_data *) dc->data;
> > +
> > +   return iommu_sva_unbind_gpasid(dc->domain, dev,
> > +   

Re: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to userspace

2020-04-01 Thread Auger Eric
Hi Yi,
On 3/22/20 1:32 PM, Liu, Yi L wrote:
> From: Liu Yi L 
> 
> VFIO exposes IOMMU nesting translation (a.k.a dual stage translation)
> capability to userspace. Thus applications like QEMU could support
> vIOMMU with hardware's nesting translation capability for pass-through
> devices. Before setting up nesting translation for pass-through devices,
> QEMU and other applications need to learn the supported 1st-lvl/stage-1
> translation structure format like page table format.
> 
> Take vSVA (virtual Shared Virtual Addressing) as an example: to support
> vSVA for pass-through devices, QEMU sets up nesting translation for pass-
> through devices. The guest page tables are configured to the host as the
> 1st-lvl/stage-1 page tables. Therefore, the guest format should be
> compatible with the host side.
> 
> This patch reports the supported 1st-lvl/stage-1 page table format on the
> current platform to userspace. QEMU and other similar applications should
> use this format info when trying to set up IOMMU nesting translation on
> the host IOMMU.
> 
> Cc: Kevin Tian 
> CC: Jacob Pan 
> Cc: Alex Williamson 
> Cc: Eric Auger 
> Cc: Jean-Philippe Brucker 
> Signed-off-by: Liu Yi L 
> ---
>  drivers/vfio/vfio_iommu_type1.c | 56 
> +
>  include/uapi/linux/vfio.h   |  1 +
>  2 files changed, 57 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 9aa2a67..82a9e0b 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -2234,11 +2234,66 @@ static int vfio_iommu_type1_pasid_free(struct 
> vfio_iommu *iommu,
>   return ret;
>  }
>  
> +static int vfio_iommu_get_stage1_format(struct vfio_iommu *iommu,
> +  u32 *stage1_format)
Maybe rename to vfio_pasid_format(), to be homogeneous with
vfio_pgsize_bitmap(), which does the same kind of enumeration over the
vfio_iommu domains.
> +{
> + struct vfio_domain *domain;
> + u32 format = 0, tmp_format = 0;
> + int ret;
ret = -EINVAL;
> +
> + mutex_lock(&iommu->lock);
> + if (list_empty(&iommu->domain_list)) {
goto out_unlock;
> + mutex_unlock(&iommu->lock);
> + return -EINVAL;
> + }
> +
> + list_for_each_entry(domain, &iommu->domain_list, next) {
> + if (iommu_domain_get_attr(domain->domain,
> + DOMAIN_ATTR_PASID_FORMAT, &format)) {
I can find DOMAIN_ATTR_PASID_FORMAT in Jacob's v9 but not in v10
> + ret = -EINVAL;
could be removed
> + format = 0;
> + goto out_unlock;
> + }
> + /*
> +  * format is always non-zero (the first format is
> +  * IOMMU_PASID_FORMAT_INTEL_VTD which is 1). For
> +  * the reason of potential different backed IOMMU
> +  * formats, here we expect to have identical formats
> +  * in the domain list, no mixed formats support.
> +  * return -EINVAL to fail the attempt of setup
> +  * VFIO_TYPE1_NESTING_IOMMU if non-identical formats
> +  * are detected.
> +  */
> + if (tmp_format && tmp_format != format) {
> + ret = -EINVAL;
could be removed
> + format = 0;
> + goto out_unlock;
> + }
> +
> + tmp_format = format;
> + }
> + ret = 0;
> +
> +out_unlock:
> + if (format)
if (!ret)? Then you can remove the format = 0 in the error cases.
> + *stage1_format = format;
> + mutex_unlock(&iommu->lock);
> + return ret;
> +}
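
[Folding the suggestions above together, the function might be structured
as follows -- an untested sketch for discussion, not a replacement patch:]

static int vfio_iommu_get_stage1_format(struct vfio_iommu *iommu,
					u32 *stage1_format)
{
	struct vfio_domain *domain;
	u32 format = 0, tmp_format = 0;
	int ret = -EINVAL;

	mutex_lock(&iommu->lock);
	if (list_empty(&iommu->domain_list))
		goto out_unlock;

	list_for_each_entry(domain, &iommu->domain_list, next) {
		if (iommu_domain_get_attr(domain->domain,
					  DOMAIN_ATTR_PASID_FORMAT, &format))
			goto out_unlock;
		/* no mixed formats: every domain must report the same one */
		if (tmp_format && tmp_format != format)
			goto out_unlock;
		tmp_format = format;
	}
	ret = 0;

out_unlock:
	if (!ret)
		*stage1_format = format;
	mutex_unlock(&iommu->lock);
	return ret;
}
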
> +
>  static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
>struct vfio_info_cap *caps)
>  {
>   struct vfio_info_cap_header *header;
>   struct vfio_iommu_type1_info_cap_nesting *nesting_cap;
> + u32 formats = 0;
> + int ret;
> +
> + ret = vfio_iommu_get_stage1_format(iommu, &formats);
> + if (ret) {
> + pr_warn("Failed to get stage-1 format\n");
a trace that can be triggered by userspace; to be removed?
> + return ret;
> + }
>  
>   header = vfio_info_cap_add(caps, sizeof(*nesting_cap),
>  VFIO_IOMMU_TYPE1_INFO_CAP_NESTING, 1);
> @@ -2254,6 +2309,7 @@ static int vfio_iommu_info_add_nesting_cap(struct 
> vfio_iommu *iommu,
>   /* nesting iommu type supports PASID requests (alloc/free) */
>   nesting_cap->nesting_capabilities |= VFIO_IOMMU_PASID_REQS;
What is the meaning for ARM?
>   }
> + nesting_cap->stage1_formats = formats;
as spotted by Kevin, since a single format is supported, rename to stage1_format
>  
>   return 0;
>  }
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index ed9881d..ebeaf3e 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -763,6 +763,7 @@ struct vfio_iommu_type1_info_cap_nesting {
>   struct  vfio_info_cap_header header;
>  #define VFIO_IOMMU_PASID_REQS   

RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to userspace

2020-04-01 Thread Liu, Yi L
> From: Tian, Kevin 
> Sent: Wednesday, April 1, 2020 4:09 PM
> Subject: RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to
> userspace
> 
> > From: Liu, Yi L 
> > Sent: Wednesday, April 1, 2020 4:07 PM
> >
> > > From: Tian, Kevin 
> > > Sent: Wednesday, April 1, 2020 3:56 PM
> > > To: Liu, Yi L ; alex.william...@redhat.com;
> > > Subject: RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1
> > > format to userspace
> > >
> > > > From: Liu, Yi L 
> > > > Sent: Wednesday, April 1, 2020 3:38 PM
> > > >
> > > >  > From: Tian, Kevin 
> > > > > Sent: Monday, March 30, 2020 7:49 PM
> > > > > To: Liu, Yi L ; alex.william...@redhat.com;
> > > > > Subject: RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1
> > > > > format to userspace
> > > > >
> > > > > > From: Liu, Yi L 
> > > > > > Sent: Sunday, March 22, 2020 8:32 PM
> > > > > >
> > > > > > From: Liu Yi L 
> > > > > >
> > > > > > VFIO exposes IOMMU nesting translation (a.k.a dual stage
> > > > > > translation) capability to userspace. Thus applications like
> > > > > > QEMU could support vIOMMU with hardware's nesting translation
> > > > > > capability for pass-through devices. Before setting up nesting
> > > > > > translation for pass-through devices, QEMU and other
> > > > > > applications need to learn the supported
> > > > > > 1st-lvl/stage-1 translation structure format like page table format.
> > > > > >
> > > > > > Take vSVA (virtual Shared Virtual Addressing) as an example,
> > > > > > to support vSVA for pass-through devices, QEMU setup nesting
> > > > > > translation for pass- through devices. The guest page table
> > > > > > are configured to host as 1st-lvl/
> > > > > > stage-1 page table. Therefore, guest format should be
> > > > > > compatible with host side.
> > > > > >
> > > > > > This patch reports the supported 1st-lvl/stage-1 page table
> > > > > > format on the current platform to userspace. QEMU and other
> > > > > > alike applications should use this format info when trying to
> > > > > > setup IOMMU nesting translation on host IOMMU.
> > > > > >
> > > > > > Cc: Kevin Tian 
> > > > > > CC: Jacob Pan 
> > > > > > Cc: Alex Williamson 
> > > > > > Cc: Eric Auger 
> > > > > > Cc: Jean-Philippe Brucker 
> > > > > > Signed-off-by: Liu Yi L 
> > > > > > ---
> > > > > >  drivers/vfio/vfio_iommu_type1.c | 56
> > > > > > +
> > > > > >  include/uapi/linux/vfio.h   |  1 +
> > > > > >  2 files changed, 57 insertions(+)
> > > > > >
> > > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > > > > > b/drivers/vfio/vfio_iommu_type1.c index 9aa2a67..82a9e0b
> > > > > > 100644
> > > > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > > > @@ -2234,11 +2234,66 @@ static int
> > > > vfio_iommu_type1_pasid_free(struct
> > > > > > vfio_iommu *iommu,
> > > > > > return ret;
> > > > > >  }
> > > > > >
> > > > > > +static int vfio_iommu_get_stage1_format(struct vfio_iommu *iommu,
> > > > > > +u32 *stage1_format)
> > > > > > +{
> > > > > > +   struct vfio_domain *domain;
> > > > > > +   u32 format = 0, tmp_format = 0;
> > > > > > +   int ret;
> > > > > > +
> > > > > > +   mutex_lock(&iommu->lock);
> > > > > > +   if (list_empty(&iommu->domain_list)) {
> > > > > > +   mutex_unlock(&iommu->lock);
> > > > > > +   return -EINVAL;
> > > > > > +   }
> > > > > > +
> > > > > > +   list_for_each_entry(domain, &iommu->domain_list, next) {
> > > > > > +   if (iommu_domain_get_attr(domain->domain,
> > > > > > +   DOMAIN_ATTR_PASID_FORMAT, &format)) {
> > > > > > +   ret = -EINVAL;
> > > > > > +   format = 0;
> > > > > > +   goto out_unlock;
> > > > > > +   }
> > > > > > +   /*
> > > > > > +* format is always non-zero (the first format is
> > > > > > +* IOMMU_PASID_FORMAT_INTEL_VTD which is 1).
> > For
> > > > > > +* the reason of potential different backed IOMMU
> > > > > > +* formats, here we expect to have identical formats
> > > > > > +* in the domain list, no mixed formats support.
> > > > > > +* return -EINVAL to fail the attempt of setup
> > > > > > +* VFIO_TYPE1_NESTING_IOMMU if non-identical
> > formats
> > > > > > +* are detected.
> > > > > > +*/
> > > > > > +   if (tmp_format && tmp_format != format) {
> > > > > > +   ret = -EINVAL;
> > > > > > +   format = 0;
> > > > > > +   goto out_unlock;
> > > > > > +   }
> > > > > > +
> > > > > > +   tmp_format = format;
> > > > > > +   }
> > > > >
> > > > > this path is invoked only in VFIO_IOMMU_GET_INFO path. If we
> > > > > don't want
> > > > to
> > > > > assume the status quo that one container holds only one device
> > > > > w/
> > > > vIOMMU
> > > > > (the prerequisite for vSVA), looks we also need check the format
> > 

RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to userspace

2020-04-01 Thread Tian, Kevin
> From: Liu, Yi L 
> Sent: Wednesday, April 1, 2020 4:07 PM
> 
> > From: Tian, Kevin 
> > Sent: Wednesday, April 1, 2020 3:56 PM
> > To: Liu, Yi L ; alex.william...@redhat.com;
> > Subject: RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to
> > userspace
> >
> > > From: Liu, Yi L 
> > > Sent: Wednesday, April 1, 2020 3:38 PM
> > >
> > >  > From: Tian, Kevin 
> > > > Sent: Monday, March 30, 2020 7:49 PM
> > > > To: Liu, Yi L ; alex.william...@redhat.com;
> > > > Subject: RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1
> > > > format to userspace
> > > >
> > > > > From: Liu, Yi L 
> > > > > Sent: Sunday, March 22, 2020 8:32 PM
> > > > >
> > > > > From: Liu Yi L 
> > > > >
> > > > > VFIO exposes IOMMU nesting translation (a.k.a dual stage
> > > > > translation) capability to userspace. Thus applications like QEMU
> > > > > could support vIOMMU with hardware's nesting translation
> > > > > capability for pass-through devices. Before setting up nesting
> > > > > translation for pass-through devices, QEMU and other applications
> > > > > need to learn the supported
> > > > > 1st-lvl/stage-1 translation structure format like page table format.
> > > > >
> > > > > Take vSVA (virtual Shared Virtual Addressing) as an example, to
> > > > > support vSVA for pass-through devices, QEMU setup nesting
> > > > > translation for pass- through devices. The guest page table are
> > > > > configured to host as 1st-lvl/
> > > > > stage-1 page table. Therefore, guest format should be compatible
> > > > > with host side.
> > > > >
> > > > > This patch reports the supported 1st-lvl/stage-1 page table format
> > > > > on the current platform to userspace. QEMU and other alike
> > > > > applications should use this format info when trying to setup
> > > > > IOMMU nesting translation on host IOMMU.
> > > > >
> > > > > Cc: Kevin Tian 
> > > > > CC: Jacob Pan 
> > > > > Cc: Alex Williamson 
> > > > > Cc: Eric Auger 
> > > > > Cc: Jean-Philippe Brucker 
> > > > > Signed-off-by: Liu Yi L 
> > > > > ---
> > > > >  drivers/vfio/vfio_iommu_type1.c | 56
> > > > > +
> > > > >  include/uapi/linux/vfio.h   |  1 +
> > > > >  2 files changed, 57 insertions(+)
> > > > >
> > > > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > > > > b/drivers/vfio/vfio_iommu_type1.c index 9aa2a67..82a9e0b 100644
> > > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > > @@ -2234,11 +2234,66 @@ static int
> > > vfio_iommu_type1_pasid_free(struct
> > > > > vfio_iommu *iommu,
> > > > >   return ret;
> > > > >  }
> > > > >
> > > > > +static int vfio_iommu_get_stage1_format(struct vfio_iommu *iommu,
> > > > > +  u32 *stage1_format)
> > > > > +{
> > > > > + struct vfio_domain *domain;
> > > > > + u32 format = 0, tmp_format = 0;
> > > > > + int ret;
> > > > > +
> > > > > + mutex_lock(&iommu->lock);
> > > > > + if (list_empty(&iommu->domain_list)) {
> > > > > + mutex_unlock(&iommu->lock);
> > > > > + return -EINVAL;
> > > > > + }
> > > > > +
> > > > > + list_for_each_entry(domain, &iommu->domain_list, next) {
> > > > > + if (iommu_domain_get_attr(domain->domain,
> > > > > + DOMAIN_ATTR_PASID_FORMAT, &format)) {
> > > > > + ret = -EINVAL;
> > > > > + format = 0;
> > > > > + goto out_unlock;
> > > > > + }
> > > > > + /*
> > > > > +  * format is always non-zero (the first format is
> > > > > +  * IOMMU_PASID_FORMAT_INTEL_VTD which is 1).
> For
> > > > > +  * the reason of potential different backed IOMMU
> > > > > +  * formats, here we expect to have identical formats
> > > > > +  * in the domain list, no mixed formats support.
> > > > > +  * return -EINVAL to fail the attempt of setup
> > > > > +  * VFIO_TYPE1_NESTING_IOMMU if non-identical
> formats
> > > > > +  * are detected.
> > > > > +  */
> > > > > + if (tmp_format && tmp_format != format) {
> > > > > + ret = -EINVAL;
> > > > > + format = 0;
> > > > > + goto out_unlock;
> > > > > + }
> > > > > +
> > > > > + tmp_format = format;
> > > > > + }
> > > >
> > > > this path is invoked only in VFIO_IOMMU_GET_INFO path. If we don't
> > > > want
> > > to
> > > > assume the status quo that one container holds only one device w/
> > > vIOMMU
> > > > (the prerequisite for vSVA), looks we also need check the format
> > > > compatibility when attaching a new group to this container?
> > >
> > > right. if attaching to a nesting type container (vfio_iommu.nesting
> > > bit indicates it), it should check if it is compatible with prior
> > > domains in the domain list. But if it is the first one attached to
> > > this container, it's fine. is it 

RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to userspace

2020-04-01 Thread Liu, Yi L
> From: Tian, Kevin 
> Sent: Wednesday, April 1, 2020 3:56 PM
> To: Liu, Yi L ; alex.william...@redhat.com;
> Subject: RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to
> userspace
> 
> > From: Liu, Yi L 
> > Sent: Wednesday, April 1, 2020 3:38 PM
> >
> >  > From: Tian, Kevin 
> > > Sent: Monday, March 30, 2020 7:49 PM
> > > To: Liu, Yi L ; alex.william...@redhat.com;
> > > Subject: RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1
> > > format to userspace
> > >
> > > > From: Liu, Yi L 
> > > > Sent: Sunday, March 22, 2020 8:32 PM
> > > >
> > > > From: Liu Yi L 
> > > >
> > > > VFIO exposes IOMMU nesting translation (a.k.a dual stage
> > > > translation) capability to userspace. Thus applications like QEMU
> > > > could support vIOMMU with hardware's nesting translation
> > > > capability for pass-through devices. Before setting up nesting
> > > > translation for pass-through devices, QEMU and other applications
> > > > need to learn the supported
> > > > 1st-lvl/stage-1 translation structure format like page table format.
> > > >
> > > > Take vSVA (virtual Shared Virtual Addressing) as an example, to
> > > > support vSVA for pass-through devices, QEMU setup nesting
> > > > translation for pass- through devices. The guest page table are
> > > > configured to host as 1st-lvl/
> > > > stage-1 page table. Therefore, guest format should be compatible
> > > > with host side.
> > > >
> > > > This patch reports the supported 1st-lvl/stage-1 page table format
> > > > on the current platform to userspace. QEMU and other alike
> > > > applications should use this format info when trying to setup
> > > > IOMMU nesting translation on host IOMMU.
> > > >
> > > > Cc: Kevin Tian 
> > > > CC: Jacob Pan 
> > > > Cc: Alex Williamson 
> > > > Cc: Eric Auger 
> > > > Cc: Jean-Philippe Brucker 
> > > > Signed-off-by: Liu Yi L 
> > > > ---
> > > >  drivers/vfio/vfio_iommu_type1.c | 56
> > > > +
> > > >  include/uapi/linux/vfio.h   |  1 +
> > > >  2 files changed, 57 insertions(+)
> > > >
> > > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > > > b/drivers/vfio/vfio_iommu_type1.c index 9aa2a67..82a9e0b 100644
> > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > @@ -2234,11 +2234,66 @@ static int
> > vfio_iommu_type1_pasid_free(struct
> > > > vfio_iommu *iommu,
> > > > return ret;
> > > >  }
> > > >
> > > > +static int vfio_iommu_get_stage1_format(struct vfio_iommu *iommu,
> > > > +u32 *stage1_format)
> > > > +{
> > > > +   struct vfio_domain *domain;
> > > > +   u32 format = 0, tmp_format = 0;
> > > > +   int ret;
> > > > +
> > > > +   mutex_lock(&iommu->lock);
> > > > +   if (list_empty(&iommu->domain_list)) {
> > > > +   mutex_unlock(&iommu->lock);
> > > > +   return -EINVAL;
> > > > +   }
> > > > +
> > > > +   list_for_each_entry(domain, &iommu->domain_list, next) {
> > > > +   if (iommu_domain_get_attr(domain->domain,
> > > > +   DOMAIN_ATTR_PASID_FORMAT, &format)) {
> > > > +   ret = -EINVAL;
> > > > +   format = 0;
> > > > +   goto out_unlock;
> > > > +   }
> > > > +   /*
> > > > +* format is always non-zero (the first format is
> > > > +* IOMMU_PASID_FORMAT_INTEL_VTD which is 1). For
> > > > +* the reason of potential different backed IOMMU
> > > > +* formats, here we expect to have identical formats
> > > > +* in the domain list, no mixed formats support.
> > > > +* return -EINVAL to fail the attempt of setup
> > > > +* VFIO_TYPE1_NESTING_IOMMU if non-identical formats
> > > > +* are detected.
> > > > +*/
> > > > +   if (tmp_format && tmp_format != format) {
> > > > +   ret = -EINVAL;
> > > > +   format = 0;
> > > > +   goto out_unlock;
> > > > +   }
> > > > +
> > > > +   tmp_format = format;
> > > > +   }
> > >
> > > this path is invoked only in VFIO_IOMMU_GET_INFO path. If we don't
> > > want
> > to
> > > assume the status quo that one container holds only one device w/
> > vIOMMU
> > > (the prerequisite for vSVA), looks we also need check the format
> > > compatibility when attaching a new group to this container?
> >
> > right. if attaching to a nesting type container (vfio_iommu.nesting
> > bit indicates it), it should check if it is compatible with prior
> > domains in the domain list. But if it is the first one attached to
> > this container, it's fine. is it good?
> 
> yes, but my point is whether we should check the format compatibility
> in the attach path...

I guess so. Assume a device has been attached to a container, and
userspace has fetched the nesting cap info. e.g. QEMU 

RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to userspace

2020-04-01 Thread Tian, Kevin
> From: Liu, Yi L 
> Sent: Wednesday, April 1, 2020 3:38 PM
> 
>  > From: Tian, Kevin 
> > Sent: Monday, March 30, 2020 7:49 PM
> > To: Liu, Yi L ; alex.william...@redhat.com;
> > Subject: RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to
> > userspace
> >
> > > From: Liu, Yi L 
> > > Sent: Sunday, March 22, 2020 8:32 PM
> > >
> > > From: Liu Yi L 
> > >
> > > VFIO exposes IOMMU nesting translation (a.k.a dual stage translation)
> > > capability to userspace. Thus applications like QEMU could support
> > > vIOMMU with hardware's nesting translation capability for pass-through
> > > devices. Before setting up nesting translation for pass-through
> > > devices, QEMU and other applications need to learn the supported
> > > 1st-lvl/stage-1 translation structure format like page table format.
> > >
> > > Take vSVA (virtual Shared Virtual Addressing) as an example, to
> > > support vSVA for pass-through devices, QEMU setup nesting translation
> > > for pass- through devices. The guest page table are configured to host
> > > as 1st-lvl/
> > > stage-1 page table. Therefore, guest format should be compatible with
> > > host side.
> > >
> > > This patch reports the supported 1st-lvl/stage-1 page table format on
> > > the current platform to userspace. QEMU and other alike applications
> > > should use this format info when trying to setup IOMMU nesting
> > > translation on host IOMMU.
> > >
> > > Cc: Kevin Tian 
> > > CC: Jacob Pan 
> > > Cc: Alex Williamson 
> > > Cc: Eric Auger 
> > > Cc: Jean-Philippe Brucker 
> > > Signed-off-by: Liu Yi L 
> > > ---
> > >  drivers/vfio/vfio_iommu_type1.c | 56
> > > +
> > >  include/uapi/linux/vfio.h   |  1 +
> > >  2 files changed, 57 insertions(+)
> > >
> > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > > b/drivers/vfio/vfio_iommu_type1.c index 9aa2a67..82a9e0b 100644
> > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > @@ -2234,11 +2234,66 @@ static int
> vfio_iommu_type1_pasid_free(struct
> > > vfio_iommu *iommu,
> > >   return ret;
> > >  }
> > >
> > > +static int vfio_iommu_get_stage1_format(struct vfio_iommu *iommu,
> > > +  u32 *stage1_format)
> > > +{
> > > + struct vfio_domain *domain;
> > > + u32 format = 0, tmp_format = 0;
> > > + int ret;
> > > +
> > > + mutex_lock(&iommu->lock);
> > > + if (list_empty(&iommu->domain_list)) {
> > > + mutex_unlock(&iommu->lock);
> > > + return -EINVAL;
> > > + }
> > > +
> > > + list_for_each_entry(domain, &iommu->domain_list, next) {
> > > + if (iommu_domain_get_attr(domain->domain,
> > > + DOMAIN_ATTR_PASID_FORMAT, &format)) {
> > > + ret = -EINVAL;
> > > + format = 0;
> > > + goto out_unlock;
> > > + }
> > > + /*
> > > +  * format is always non-zero (the first format is
> > > +  * IOMMU_PASID_FORMAT_INTEL_VTD which is 1). For
> > > +  * the reason of potential different backed IOMMU
> > > +  * formats, here we expect to have identical formats
> > > +  * in the domain list, no mixed formats support.
> > > +  * return -EINVAL to fail the attempt of setup
> > > +  * VFIO_TYPE1_NESTING_IOMMU if non-identical formats
> > > +  * are detected.
> > > +  */
> > > + if (tmp_format && tmp_format != format) {
> > > + ret = -EINVAL;
> > > + format = 0;
> > > + goto out_unlock;
> > > + }
> > > +
> > > + tmp_format = format;
> > > + }
> >
> > this path is invoked only in VFIO_IOMMU_GET_INFO path. If we don't want
> to
> > assume the status quo that one container holds only one device w/
> vIOMMU
> > (the prerequisite for vSVA), looks we also need check the format
> > compatibility when attaching a new group to this container?
> 
> right. if attaching to a nesting type container (vfio_iommu.nesting bit
> indicates it), it should check if it is compatible with prior domains in
> the domain list. But if it is the first one attached to this container,
> it's fine. is it good?

yes, but my point is whether we should check the format compatibility
in the attach path...
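
[An illustrative shape for such an attach-path check; the cached
stage1_format field on struct vfio_iommu is hypothetical:]

/* called while attaching a group to a nesting-type container */
static int vfio_iommu_check_stage1_format(struct vfio_iommu *iommu,
					  struct iommu_domain *domain)
{
	u32 format;

	if (iommu_domain_get_attr(domain, DOMAIN_ATTR_PASID_FORMAT, &format))
		return -EINVAL;

	/* the first attached domain sets the reference format */
	if (iommu->stage1_format && iommu->stage1_format != format)
		return -EINVAL;	/* mixed stage-1 formats unsupported */

	iommu->stage1_format = format;
	return 0;
}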

> 
> > > + ret = 0;
> > > +
> > > +out_unlock:
> > > + if (format)
> > > + *stage1_format = format;
> > > + mutex_unlock(&iommu->lock);
> > > + return ret;
> > > +}
> > > +
> > >  static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
> > >struct vfio_info_cap *caps)
> > >  {
> > >   struct vfio_info_cap_header *header;
> > >   struct vfio_iommu_type1_info_cap_nesting *nesting_cap;
> > > + u32 formats = 0;
> > > + int ret;
> > > +
> > > + ret = vfio_iommu_get_stage1_format(iommu, &formats);
> > > + if (ret) {
> > > + pr_warn("Failed to get stage-1 format\n");
> > > + return ret;
> > > + }
> > >
> > >   header = vfio_info_cap_add(caps, sizeof(*nesting_cap),
> > >  

RE: [PATCH v1 8/8] vfio/type1: Add vSVA support for IOMMU-backed mdevs

2020-04-01 Thread Liu, Yi L
> From: Tian, Kevin 
> Sent: Monday, March 30, 2020 9:19 PM
> To: Liu, Yi L ; alex.william...@redhat.com;
> Subject: RE: [PATCH v1 8/8] vfio/type1: Add vSVA support for IOMMU-backed
> mdevs
> 
> > From: Liu, Yi L 
> > Sent: Sunday, March 22, 2020 8:32 PM
> >
> > From: Liu Yi L 
> >
> > In recent years, mediated device pass-through framework (e.g. vfio-mdev)
> > are used to achieve flexible device sharing across domains (e.g. VMs).
> 
> are->is

got it.

> > Also there are hardware assisted mediated pass-through solutions from
> > platform vendors. e.g. Intel VT-d scalable mode which supports Intel
> > Scalable I/O Virtualization technology. Such mdevs are called IOMMU-
> > backed mdevs as there is IOMMU-enforced DMA isolation for such mdevs.
> > In kernel, IOMMU-backed mdevs are exposed to IOMMU layer by aux-
> > domain concept, which means mdevs are protected by an iommu domain
> > which is aux-domain of its physical device. Details can be found in
> > the KVM
> 
> "by an iommu domain which is auxiliary to the domain that the kernel driver
> primarily uses for DMA API"

yep.

> > presentation from Kevin Tian. IOMMU-backed is equivalent to IOMMU-capable.
> >
> > https://events19.linuxfoundation.org/wp-content/uploads/2017/12/\
> > Hardware-Assisted-Mediated-Pass-Through-with-VFIO-Kevin-Tian-Intel.pdf
> >
> > This patch supports NESTING IOMMU for IOMMU-backed mdevs by figuring
> > out the physical device of an IOMMU-backed mdev and then invoking
> > IOMMU requests to IOMMU layer with the physical device and the mdev's
> > aux domain info.
> 
> "and then calling into the IOMMU layer to complete the vSVA operations on the 
> aux
> domain associated with that mdev"

got it.
> >
> > With this patch, vSVA (Virtual Shared Virtual Addressing) can be used
> > on IOMMU-backed mdevs.
> >
> > Cc: Kevin Tian 
> > CC: Jacob Pan 
> > CC: Jun Tian 
> > Cc: Alex Williamson 
> > Cc: Eric Auger 
> > Cc: Jean-Philippe Brucker 
> > Signed-off-by: Liu Yi L 
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 23 ---
> >  1 file changed, 20 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c index 937ec3f..d473665 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -132,6 +132,7 @@ struct vfio_regions {
> >
> >  struct domain_capsule {
> > struct iommu_domain *domain;
> > +   struct vfio_group *group;
> > void *data;
> >  };
> >
> > @@ -148,6 +149,7 @@ static int vfio_iommu_for_each_dev(struct
> > vfio_iommu *iommu,
> > list_for_each_entry(d, >domain_list, next) {
> > dc.domain = d->domain;
> > list_for_each_entry(g, >group_list, next) {
> > +   dc.group = g;
> > ret = iommu_group_for_each_dev(g->iommu_group,
> >    &dc, fn);
> > if (ret)
> > @@ -2347,7 +2349,12 @@ static int vfio_bind_gpasid_fn(struct device
> > *dev, void *data)
> > struct iommu_gpasid_bind_data *gbind_data =
> > (struct iommu_gpasid_bind_data *) dc->data;
> >
> > -   return iommu_sva_bind_gpasid(dc->domain, dev, gbind_data);
> > +   if (dc->group->mdev_group)
> > +   return iommu_sva_bind_gpasid(dc->domain,
> > +   vfio_mdev_get_iommu_device(dev), gbind_data);
> > +   else
> > +   return iommu_sva_bind_gpasid(dc->domain,
> > +   dev, gbind_data);
> >  }
> >
> >  static int vfio_unbind_gpasid_fn(struct device *dev, void *data) @@
> > -2356,8 +2363,13 @@ static int vfio_unbind_gpasid_fn(struct device
> > *dev, void *data)
> > struct iommu_gpasid_bind_data *gbind_data =
> > (struct iommu_gpasid_bind_data *) dc->data;
> >
> > -   return iommu_sva_unbind_gpasid(dc->domain, dev,
> > +   if (dc->group->mdev_group)
> > +   return iommu_sva_unbind_gpasid(dc->domain,
> > +   vfio_mdev_get_iommu_device(dev),
> > gbind_data->hpasid);
> > +   else
> > +   return iommu_sva_unbind_gpasid(dc->domain, dev,
> > +   gbind_data->hpasid);
> >  }
> >
> >  /**
> > @@ -2429,7 +2441,12 @@ static int vfio_cache_inv_fn(struct device
> > *dev, void *data)
> > struct iommu_cache_invalidate_info *cache_inv_info =
> > (struct iommu_cache_invalidate_info *) dc->data;
> >
> > -   return iommu_cache_invalidate(dc->domain, dev, cache_inv_info);
> > +   if (dc->group->mdev_group)
> > +   return iommu_cache_invalidate(dc->domain,
> > +   vfio_mdev_get_iommu_device(dev), cache_inv_info);
> > +   else
> > +   return iommu_cache_invalidate(dc->domain,
> > +   dev, cache_inv_info);
> >  }
> 
> possibly above could be simplified, e.g.
> 
> static struct device *vfio_get_iommu_device(struct vfio_group *group,
>   struct device *dev)
> 
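[Completing that sketch for illustration -- the helper would centralize
the mdev special-casing:]

static struct device *vfio_get_iommu_device(struct vfio_group *group,
					    struct device *dev)
{
	if (group->mdev_group)
		/* IOMMU-backed mdev: operate on the parent physical device */
		return vfio_mdev_get_iommu_device(dev);
	return dev;
}

so that each caller reduces to, e.g.:

	struct device *iommu_device = vfio_get_iommu_device(dc->group, dev);

	return iommu_sva_bind_gpasid(dc->domain, iommu_device, gbind_data);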

RE: [PATCH v1 7/8] vfio/type1: Add VFIO_IOMMU_CACHE_INVALIDATE

2020-04-01 Thread Liu, Yi L
> From: Tian, Kevin 
> Sent: Monday, March 30, 2020 8:58 PM
> To: Liu, Yi L ; alex.william...@redhat.com;
> Subject: RE: [PATCH v1 7/8] vfio/type1: Add VFIO_IOMMU_CACHE_INVALIDATE
> 
> > From: Liu, Yi L 
> > Sent: Sunday, March 22, 2020 8:32 PM
> >
> > From: Liu Yi L 
> >
> > For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest "owns"
> > the
> > first-level/stage-1 translation structures, the host IOMMU driver has
> > no knowledge of first-level/stage-1 structure cache updates unless the
> > guest invalidation requests are trapped and propagated to the host.
> >
> > This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to propagate
> > guest
> > first-level/stage-1 IOMMU cache invalidations to host to ensure IOMMU
> > cache correctness.
> >
> > With this patch, vSVA (Virtual Shared Virtual Addressing) can be used
> > safely as the host IOMMU iotlb correctness are ensured.
> >
> > Cc: Kevin Tian 
> > CC: Jacob Pan 
> > Cc: Alex Williamson 
> > Cc: Eric Auger 
> > Cc: Jean-Philippe Brucker 
> > Signed-off-by: Liu Yi L 
> > Signed-off-by: Eric Auger 
> > Signed-off-by: Jacob Pan 
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 49
> > +
> >  include/uapi/linux/vfio.h   | 22 ++
> >  2 files changed, 71 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c index a877747..937ec3f 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -2423,6 +2423,15 @@ static long
> > vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
> > return ret;
> >  }
> >
> > +static int vfio_cache_inv_fn(struct device *dev, void *data)
> 
> vfio_iommu_cache_inv_fn

got it.

> > +{
> > +   struct domain_capsule *dc = (struct domain_capsule *)data;
> > +   struct iommu_cache_invalidate_info *cache_inv_info =
> > +   (struct iommu_cache_invalidate_info *) dc->data;
> > +
> > +   return iommu_cache_invalidate(dc->domain, dev, cache_inv_info);
> > +}
> > +
> >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> >unsigned int cmd, unsigned long arg)
> > {
> > @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> > }
> > kfree(gbind_data);
> > return ret;
> > +   } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) {
> > +   struct vfio_iommu_type1_cache_invalidate cache_inv;
> > +   u32 version;
> > +   int info_size;
> > +   void *cache_info;
> > +   int ret;
> > +
> > +   minsz = offsetofend(struct
> > vfio_iommu_type1_cache_invalidate,
> > +   flags);
> > +
> > +   if (copy_from_user(&cache_inv, (void __user *)arg, minsz))
> > +   return -EFAULT;
> > +
> > +   if (cache_inv.argsz < minsz || cache_inv.flags)
> > +   return -EINVAL;
> > +
> > +   /* Get the version of struct iommu_cache_invalidate_info */
> > +   if (copy_from_user(&version,
> > +   (void __user *) (arg + minsz), sizeof(version)))
> > +   return -EFAULT;
> > +
> > +   info_size = iommu_uapi_get_data_size(
> > +   IOMMU_UAPI_CACHE_INVAL,
> > version);
> > +
> > +   cache_info = kzalloc(info_size, GFP_KERNEL);
> > +   if (!cache_info)
> > +   return -ENOMEM;
> > +
> > +   if (copy_from_user(cache_info,
> > +   (void __user *) (arg + minsz), info_size)) {
> > +   kfree(cache_info);
> > +   return -EFAULT;
> > +   }
> > +
> > +   mutex_lock(&iommu->lock);
> > +   ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn,
> > +   cache_info);
> > +   mutex_unlock(&iommu->lock);
> > +   kfree(cache_info);
> > +   return ret;
> > }
> >
> > return -ENOTTY;
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 2235bc6..62ca791 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind {
> >   */
> >  #define VFIO_IOMMU_BIND_IO(VFIO_TYPE, VFIO_BASE + 23)
> >
> > +/**
> > + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24,
> > + * struct vfio_iommu_type1_cache_invalidate)
> > + *
> > + * Propagate guest IOMMU cache invalidation to the host. The cache
> > + * invalidation information is conveyed by @cache_info, the content
> > + * format would be structures defined in uapi/linux/iommu.h. User
> > + * should be aware of that the struct  iommu_cache_invalidate_info
> > + * has a @version field, vfio needs to parse this field before
> > +getting
> > + * data from userspace.
> > + *
> > + * Availability of this IOCTL is after VFIO_SET_IOMMU.
> > + *
> > + * returns: 0 on success, -errno on failure.
> > + */
> > +struct 
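
[A minimal userspace call sequence might look like the following; it
assumes the uapi struct embeds the invalidation info as a cache_info
member right after argsz/flags, as the kernel-side copy_from_user()
offsets above imply:]

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include <linux/iommu.h>

static int flush_guest_cache(int container,
			     const struct iommu_cache_invalidate_info *info)
{
	struct vfio_iommu_type1_cache_invalidate inv;

	memset(&inv, 0, sizeof(inv));
	inv.argsz = sizeof(inv);
	inv.flags = 0;
	inv.cache_info = *info;		/* assumed embedded member */

	return ioctl(container, VFIO_IOMMU_CACHE_INVALIDATE, &inv);
}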

RE: [PATCH v1 3/8] vfio/type1: Report PASID alloc/free support to userspace

2020-04-01 Thread Liu, Yi L
> From: Tian, Kevin 
> Sent: Monday, March 30, 2020 5:44 PM
> To: Liu, Yi L ; alex.william...@redhat.com;
> Subject: RE: [PATCH v1 3/8] vfio/type1: Report PASID alloc/free support to
> userspace
> 
> > From: Liu, Yi L 
> > Sent: Sunday, March 22, 2020 8:32 PM
> >
> > From: Liu Yi L 
> >
> > This patch reports PASID alloc/free availability to userspace (e.g.
> > QEMU) thus userspace could do a pre-check before utilizing this feature.
> >
> > Cc: Kevin Tian 
> > CC: Jacob Pan 
> > Cc: Alex Williamson 
> > Cc: Eric Auger 
> > Cc: Jean-Philippe Brucker 
> > Signed-off-by: Liu Yi L 
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 28 
> >  include/uapi/linux/vfio.h   |  8 
> >  2 files changed, 36 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c index e40afc0..ddd1ffe 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -2234,6 +2234,30 @@ static int vfio_iommu_type1_pasid_free(struct
> > vfio_iommu *iommu,
> > return ret;
> >  }
> >
> > +static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
> > +struct vfio_info_cap *caps)
> > +{
> > +   struct vfio_info_cap_header *header;
> > +   struct vfio_iommu_type1_info_cap_nesting *nesting_cap;
> > +
> > +   header = vfio_info_cap_add(caps, sizeof(*nesting_cap),
> > +  VFIO_IOMMU_TYPE1_INFO_CAP_NESTING,
> > 1);
> > +   if (IS_ERR(header))
> > +   return PTR_ERR(header);
> > +
> > +   nesting_cap = container_of(header,
> > +   struct vfio_iommu_type1_info_cap_nesting,
> > +   header);
> > +
> > +   nesting_cap->nesting_capabilities = 0;
> > +   if (iommu->nesting) {
> 
> Is it good to report a nesting cap when iommu->nesting is disabled? I suppose 
> the
> check should move before vfio_info_cap_add...

oops, yes, it is.

> 
> > +   /* nesting iommu type supports PASID requests (alloc/free)
> > */
> > +   nesting_cap->nesting_capabilities |=
> > VFIO_IOMMU_PASID_REQS;
> 
> VFIO_IOMMU_CAP_PASID_REQ? to avoid confusion with ioctl cmd
> VFIO_IOMMU_PASID_REQUEST...

got it.

Thanks,
Yi Liu



RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to userspace

2020-04-01 Thread Liu, Yi L
 > From: Tian, Kevin 
> Sent: Monday, March 30, 2020 7:49 PM
> To: Liu, Yi L ; alex.william...@redhat.com;
> Subject: RE: [PATCH v1 5/8] vfio/type1: Report 1st-level/stage-1 format to
> userspace
> 
> > From: Liu, Yi L 
> > Sent: Sunday, March 22, 2020 8:32 PM
> >
> > From: Liu Yi L 
> >
> > VFIO exposes IOMMU nesting translation (a.k.a dual stage translation)
> > capability to userspace. Thus applications like QEMU could support
> > vIOMMU with hardware's nesting translation capability for pass-through
> > devices. Before setting up nesting translation for pass-through
> > devices, QEMU and other applications need to learn the supported
> > 1st-lvl/stage-1 translation structure format like page table format.
> >
> > Take vSVA (virtual Shared Virtual Addressing) as an example, to
> > support vSVA for pass-through devices, QEMU setup nesting translation
> > for pass- through devices. The guest page table are configured to host
> > as 1st-lvl/
> > stage-1 page table. Therefore, guest format should be compatible with
> > host side.
> >
> > This patch reports the supported 1st-lvl/stage-1 page table format on
> > the current platform to userspace. QEMU and other alike applications
> > should use this format info when trying to setup IOMMU nesting
> > translation on host IOMMU.
> >
> > Cc: Kevin Tian 
> > CC: Jacob Pan 
> > Cc: Alex Williamson 
> > Cc: Eric Auger 
> > Cc: Jean-Philippe Brucker 
> > Signed-off-by: Liu Yi L 
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 56
> > +
> >  include/uapi/linux/vfio.h   |  1 +
> >  2 files changed, 57 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c index 9aa2a67..82a9e0b 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -2234,11 +2234,66 @@ static int vfio_iommu_type1_pasid_free(struct
> > vfio_iommu *iommu,
> > return ret;
> >  }
> >
> > +static int vfio_iommu_get_stage1_format(struct vfio_iommu *iommu,
> > +u32 *stage1_format)
> > +{
> > +   struct vfio_domain *domain;
> > +   u32 format = 0, tmp_format = 0;
> > +   int ret;
> > +
> > +   mutex_lock(&iommu->lock);
> > +   if (list_empty(&iommu->domain_list)) {
> > +   mutex_unlock(&iommu->lock);
> > +   return -EINVAL;
> > +   }
> > +
> > +   list_for_each_entry(domain, &iommu->domain_list, next) {
> > +   if (iommu_domain_get_attr(domain->domain,
> > +   DOMAIN_ATTR_PASID_FORMAT, &format)) {
> > +   ret = -EINVAL;
> > +   format = 0;
> > +   goto out_unlock;
> > +   }
> > +   /*
> > +* format is always non-zero (the first format is
> > +* IOMMU_PASID_FORMAT_INTEL_VTD which is 1). For
> > +* the reason of potential different backed IOMMU
> > +* formats, here we expect to have identical formats
> > +* in the domain list, no mixed formats support.
> > +* return -EINVAL to fail the attempt of setup
> > +* VFIO_TYPE1_NESTING_IOMMU if non-identical formats
> > +* are detected.
> > +*/
> > +   if (tmp_format && tmp_format != format) {
> > +   ret = -EINVAL;
> > +   format = 0;
> > +   goto out_unlock;
> > +   }
> > +
> > +   tmp_format = format;
> > +   }
> 
> this path is invoked only in VFIO_IOMMU_GET_INFO path. If we don't want to
> assume the status quo that one container holds only one device w/ vIOMMU
> (the prerequisite for vSVA), looks like we also need to check the format
> compatibility when attaching a new group to this container?

right. if attaching to a nesting type container (vfio_iommu.nesting bit
indicates it), it should check if it is compatible with prior domains in
the domain list. But if it is the first one attached to this container,
it's fine. is it good?

> > +   ret = 0;
> > +
> > +out_unlock:
> > +   if (format)
> > +   *stage1_format = format;
> > +   mutex_unlock(&iommu->lock);
> > +   return ret;
> > +}
> > +
> >  static int vfio_iommu_info_add_nesting_cap(struct vfio_iommu *iommu,
> >  struct vfio_info_cap *caps)
> >  {
> > struct vfio_info_cap_header *header;
> > struct vfio_iommu_type1_info_cap_nesting *nesting_cap;
> > +   u32 formats = 0;
> > +   int ret;
> > +
> > +   ret = vfio_iommu_get_stage1_format(iommu, &formats);
> > +   if (ret) {
> > +   pr_warn("Failed to get stage-1 format\n");
> > +   return ret;
> > +   }
> >
> > header = vfio_info_cap_add(caps, sizeof(*nesting_cap),
> >VFIO_IOMMU_TYPE1_INFO_CAP_NESTING,
> > 1);
> > @@ -2254,6 +2309,7 @@ static int
> > vfio_iommu_info_add_nesting_cap(struct
> > vfio_iommu *iommu,
> > /* nesting iommu type supports PASID requests (alloc/free) */
> > nesting_cap->nesting_capabilities |= VFIO_IOMMU_PASID_REQS;
> >  

Re: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function

2020-04-01 Thread Auger Eric
Hi,

On 4/1/20 9:13 AM, Liu, Yi L wrote:
>> From: Tian, Kevin 
>> Sent: Wednesday, April 1, 2020 2:30 PM
>> To: Jacob Pan 
>> Subject: RE: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function
>>
>>> From: Jacob Pan 
>>> Sent: Wednesday, April 1, 2020 4:58 AM
>>>
>>> On Tue, 31 Mar 2020 02:49:21 +
>>> "Tian, Kevin"  wrote:
>>>
> From: Auger Eric 
> Sent: Sunday, March 29, 2020 11:34 PM
>
> Hi,
>
> On 3/28/20 11:01 AM, Tian, Kevin wrote:
>>> From: Jacob Pan 
>>> Sent: Saturday, March 21, 2020 7:28 AM
>>>
>>> When Shared Virtual Address (SVA) is enabled for a guest OS via
>>> vIOMMU, we need to provide invalidation support at IOMMU API
>>> and
> driver
>>> level. This patch adds Intel VT-d specific function to
>>> implement iommu passdown invalidate API for shared virtual address.
>>>
>>> The use case is for supporting caching structure invalidation
>>> of assigned SVM capable devices. Emulated IOMMU exposes queue
  [...]
  [...]
> to
>>> + * VT-d granularity. Invalidation is typically included in the
>>> unmap
> operation
>>> + * as a result of DMA or VFIO unmap. However, for assigned
>>> devices
> guest
>>> + * owns the first level page tables. Invalidations of
>>> translation caches in
> the
  [...]
  [...]
  [...]
>
>>> inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_
>>> NR] = {
>>> +   /*
>>> +* PASID based IOTLB invalidation: PASID selective (per
>>> PASID),
>>> +* page selective (address granularity)
>>> +*/
>>> +   {0, 1, 1},
>>> +   /* PASID based dev TLBs, only support all PASIDs or
>>> single PASID */
>>> +   {1, 1, 0},
>>
>> Is this combination correct? when single PASID is being
>> specified, it is essentially a page-selective invalidation since
>> you need provide Address and Size.
> Isn't it the same when G=1? Still the addr/size is used. Doesn't
> it

 I thought addr/size is not used when G=1, but it might be wrong. I'm
 checking with our vt-d spec owner.

>>>
> correspond to IOMMU_INV_GRANU_ADDR with IOMMU_INV_ADDR_FLAGS_PASID
> flag unset?
>
> so {0, 0, 1}?

>>> I am not sure I got your logic. The three fields correspond to
>>> IOMMU_INV_GRANU_DOMAIN, /* domain-selective
>>> invalidation */
>>> IOMMU_INV_GRANU_PASID,  /* PASID-selective invalidation */
>>> IOMMU_INV_GRANU_ADDR,   /* page-selective invalidation *
>>>
>>> For devTLB, we use domain as global since there is no domain. Then I
>>> came up with {1, 1, 0}, which means we could have global and pasid
>>> granu invalidation for PASID based devTLB.
>>>
>>> If the caller also provide addr and S bit, the flush routine will put
>>
>> "also" -> "must", because vt-d requires addr/size must be provided in
>> devtlb
>> descriptor, that is why Eric suggests {0, 0, 1}.
> 
> I think it should be {0, 0, 1} :-) addr field and S field are a must; the
> pasid field depends on the G bit.

On my side, I understood from the spec that addr/S are always used
whatever the granularity, hence the above suggestion.

As a comparison, for PASID based IOTLB invalidation, it is clearly
stated that if G matches PASID selective invalidation, address field is
ignored. This is not written that way for PASID-based device TLB inv.
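
[Under that reading, the dev TLB row of inv_type_granu_map would become,
illustratively:]

	/* PASID based dev TLBs: addr/size are always provided, so only
	 * page-selective (address) granularity is valid */
	{0, 0, 1},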
> 
> I didn't read through all comments. Here is a concern with this 2-D table:
> the iommu cache type is defined as below, and I suppose there is a problem
> here. If I'm using IOMMU_CACHE_INV_TYPE_PASID, it will index beyond the
> 2-D table.
> 
> /* IOMMU paging structure cache */
> #define IOMMU_CACHE_INV_TYPE_IOTLB  (1 << 0) /* IOMMU IOTLB */
> #define IOMMU_CACHE_INV_TYPE_DEV_IOTLB  (1 << 1) /* Device IOTLB */
> #define IOMMU_CACHE_INV_TYPE_PASID  (1 << 2) /* PASID cache */
> #define IOMMU_CACHE_INV_TYPE_NR (3)
oops, indeed

Thanks

Eric
> 
>>>
 I have one more open:

 How does userspace know which invalidation type/gran is supported?
 I didn't see such capability reporting in Yi's VFIO vSVA patch set.
 Do we want the user/kernel assume the same capability set if they
 are architectural? However the kernel could also do some
 optimization e.g. hide devtlb invalidation capability given that the
 kernel already invalidate devtlb automatically when serving iotlb
 invalidation...

>>> In general, we are trending to use VFIO capability chain to expose
>>> iommu capabilities.
>>>
>>> But for architectural features such as type/granu, we have to assume
>>> the same capability between host & guest. Granu and types are not
>>> enumerated on the host IOMMU either.
>>>
>>> For devTLB optimization, I agree we need to expose a capability to the
>>> guest stating that implicit devtlb invalidation is supported.
>>> Otherwise, a Linux guest running on other OSes may not support implicit
>>> devtlb invalidation.

RE: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function

2020-04-01 Thread Liu, Yi L
> From: Tian, Kevin 
> Sent: Wednesday, April 1, 2020 2:30 PM
> To: Jacob Pan 
> Subject: RE: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function
> 
> > From: Jacob Pan 
> > Sent: Wednesday, April 1, 2020 4:58 AM
> >
> > On Tue, 31 Mar 2020 02:49:21 +
> > "Tian, Kevin"  wrote:
> >
> > > > From: Auger Eric 
> > > > Sent: Sunday, March 29, 2020 11:34 PM
> > > >
> > > > Hi,
> > > >
> > > > On 3/28/20 11:01 AM, Tian, Kevin wrote:
> > > > >> From: Jacob Pan 
> > > > >> Sent: Saturday, March 21, 2020 7:28 AM
> > > > >>
> > > > >> When Shared Virtual Address (SVA) is enabled for a guest OS via
> > > > >> vIOMMU, we need to provide invalidation support at IOMMU API
> > > > >> and
> > > > driver
> > > > >> level. This patch adds Intel VT-d specific function to
> > > > >> implement iommu passdown invalidate API for shared virtual address.
> > > > >>
> > > > >> The use case is for supporting caching structure invalidation
> > > > >> of assigned SVM capable devices. Emulated IOMMU exposes queue
> > >  [...]
> > >  [...]
> > > > to
> > > > >> + * VT-d granularity. Invalidation is typically included in the
> > > > >> unmap
> > > > operation
> > > > >> + * as a result of DMA or VFIO unmap. However, for assigned
> > > > >> devices
> > > > guest
> > > > >> + * owns the first level page tables. Invalidations of
> > > > >> translation caches in
> > > > the
> > >  [...]
> > >  [...]
> > >  [...]
> > > >
> > inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_
> > > > >> NR] = {
> > > > >> +/*
> > > > >> + * PASID based IOTLB invalidation: PASID selective (per
> > > > >> PASID),
> > > > >> + * page selective (address granularity)
> > > > >> + */
> > > > >> +{0, 1, 1},
> > > > >> +/* PASID based dev TLBs, only support all PASIDs or
> > > > >> single PASID */
> > > > >> +{1, 1, 0},
> > > > >
> > > > > Is this combination correct? when single PASID is being
> > > > > specified, it is essentially a page-selective invalidation since
> > > > > you need provide Address and Size.
> > > > Isn't it the same when G=1? Still the addr/size is used. Doesn't
> > > > it
> > >
> > > I thought addr/size is not used when G=1, but it might be wrong. I'm
> > > checking with our vt-d spec owner.
> > >
> >
> > > > correspond to IOMMU_INV_GRANU_ADDR with
> IOMMU_INV_ADDR_FLAGS_PASID
> > > > flag unset?
> > > >
> > > > so {0, 0, 1}?
> > >
> > I am not sure I got your logic. The three fields correspond to
> > IOMMU_INV_GRANU_DOMAIN, /* domain-selective
> > invalidation */
> > IOMMU_INV_GRANU_PASID,  /* PASID-selective invalidation */
> > IOMMU_INV_GRANU_ADDR,   /* page-selective invalidation *
> >
> > For devTLB, we use domain as global since there is no domain. Then I
> > came up with {1, 1, 0}, which means we could have global and pasid
> > granu invalidation for PASID based devTLB.
> >
> > If the caller also provide addr and S bit, the flush routine will put
> 
> "also" -> "must", because vt-d requires addr/size must be provided in
> devtlb
> descriptor, that is why Eric suggests {0, 0, 1}.

I think it should be {0, 0, 1} :-) addr field and S field are a must; the
pasid field depends on the G bit.

I didn't read through all comments. Here is a concern with this 2-D table:
the iommu cache type is defined as below, and I suppose there is a problem
here. If I'm using IOMMU_CACHE_INV_TYPE_PASID, it will index beyond the
2-D table.

/* IOMMU paging structure cache */
#define IOMMU_CACHE_INV_TYPE_IOTLB  (1 << 0) /* IOMMU IOTLB */
#define IOMMU_CACHE_INV_TYPE_DEV_IOTLB  (1 << 1) /* Device IOTLB */
#define IOMMU_CACHE_INV_TYPE_PASID  (1 << 2) /* PASID cache */
#define IOMMU_CACHE_INV_TYPE_NR (3)
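
[The cache types are single-bit flags rather than array indices, so
indexing the 3x3 table with the raw flag value (e.g. 1 << 2 == 4) would
run past it; a conversion like this hypothetical helper avoids that:]

static inline int cache_type_to_index(u32 cache_type)
{
	/* map 1 << n to n, e.g. IOMMU_CACHE_INV_TYPE_PASID (1 << 2) -> 2 */
	return ffs(cache_type) - 1;
}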

> >
> > > I have one more open:
> > >
> > > How does userspace know which invalidation type/gran is supported?
> > > I didn't see such capability reporting in Yi's VFIO vSVA patch set.
> > > Do we want the user/kernel assume the same capability set if they
> > > are architectural? However the kernel could also do some
> > > optimization e.g. hide devtlb invalidation capability given that the
> > > kernel already invalidate devtlb automatically when serving iotlb
> > > invalidation...
> > >
> > In general, we are trending to use VFIO capability chain to expose
> > iommu capabilities.
> >
> > But for architectural features such as type/granu, we have to assume
> > the same capability between host & guest. Granu and types are not
> > enumerated on the host IOMMU either.
> >
> > For devTLB optimization, I agree we need to expose a capability to the
> > guest stating that implicit devtlb invalidation is supported.
> > Otherwise, a Linux guest running on other OSes may not support implicit
> > devtlb invalidation.
> >
> > Right Yi?
> 
> Thanks for explanation. So we are assumed to support all operations
> defined in spec, so no need to expose them one-by-one. For optimization,
> I'm fine to do it later.

yes. :-)

Regards,
Yi Liu


RE: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function

2020-04-01 Thread Liu, Yi L
> From: Tian, Kevin 
> Sent: Wednesday, April 1, 2020 2:24 PM
> To: Jacob Pan 
> Subject: RE: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function
> 
> > From: Jacob Pan 
> > Sent: Wednesday, April 1, 2020 2:14 AM
> >
> > On Sat, 28 Mar 2020 10:01:42 +
> > "Tian, Kevin"  wrote:
> >
> > > > From: Jacob Pan 
> > > > Sent: Saturday, March 21, 2020 7:28 AM
> > > >
> > > > When Shared Virtual Address (SVA) is enabled for a guest OS via
> > > > vIOMMU, we need to provide invalidation support at IOMMU API and
> > > > driver level. This patch adds Intel VT-d specific function to
> > > > implement iommu passdown invalidate API for shared virtual address.
> > > >
> > > > The use case is for supporting caching structure invalidation
> > > > of assigned SVM capable devices. Emulated IOMMU exposes queue
> > >
> > > emulated IOMMU -> vIOMMU, since virito-iommu could use the
> > > interface as well.
> > >
> > True, but it does not invalidate this statement about emulated IOMMU. I
> > will add another statement saying "the same interface can be used for
> > virtio-IOMMU as well". OK?
> 
> sure
> 
> >
> > > > invalidation capability and passes down all descriptors from the
> > > > guest to the physical IOMMU.
> > > >
> > > > The assumption is that guest to host device ID mapping should be
> > > > resolved prior to calling IOMMU driver. Based on the device handle,
> > > > host IOMMU driver can replace certain fields before submit to the
> > > > invalidation queue.
> > > >
> > > > ---
> > > > v7 review fixed in v10
> > > > ---
> > > >
> > > > Signed-off-by: Jacob Pan 
> > > > Signed-off-by: Ashok Raj 
> > > > Signed-off-by: Liu, Yi L 
> > > > ---
> > > >  drivers/iommu/intel-iommu.c | 182
> > > > 
> > > >  1 file changed, 182 insertions(+)
> > > >
> > > > diff --git a/drivers/iommu/intel-iommu.c
> > > > b/drivers/iommu/intel-iommu.c index b1477cd423dd..a76afb0fd51a
> > > > 100644 --- a/drivers/iommu/intel-iommu.c
> > > > +++ b/drivers/iommu/intel-iommu.c
> > > > @@ -5619,6 +5619,187 @@ static void
> > > > intel_iommu_aux_detach_device(struct iommu_domain *domain,
> > > > aux_domain_remove_dev(to_dmar_domain(domain), dev);
> > > >  }
> > > >
> > > > +/*
> > > > + * 2D array for converting and sanitizing IOMMU generic TLB
> > > > granularity to
> > > > + * VT-d granularity. Invalidation is typically included in the
> > > > unmap operation
> > > > + * as a result of DMA or VFIO unmap. However, for assigned devices
> > > > guest
> > > > + * owns the first level page tables. Invalidations of translation
> > > > caches in the
> > > > + * guest are trapped and passed down to the host.
> > > > + *
> > > > + * vIOMMU in the guest will only expose first level page tables,
> > > > therefore
> > > > + * we do not include IOTLB granularity for request without PASID
> > > > (second level).
> > >
> > > I would revise above as "We do not support IOTLB granularity for
> > > request without PASID (second level), therefore any vIOMMU
> > > implementation that exposes the SVA capability to the guest should
> > > only expose the first level page tables, implying all invalidation
> > > requests from the guest will include a valid PASID"
> > >
> > Sounds good.
> >
> > > > + *
> > > > + * For example, to find the VT-d granularity encoding for IOTLB
> > > > + * type and page selective granularity within PASID:
> > > > + * X: indexed by iommu cache type
> > > > + * Y: indexed by enum iommu_inv_granularity
> > > > + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
> > > > + *
> > > > + * Granu_map array indicates validity of the table. 1: valid, 0:
> > > > invalid
> > > > + *
> > > > + */
> > > > +const static int
> > > >
> > inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_
> > > > NR] = {
> > > > +   /*
> > > > +* PASID based IOTLB invalidation: PASID selective (per
> > > > PASID),
> > > > +* page selective (address granularity)
> > > > +*/
> > > > +   {0, 1, 1},
> > > > +   /* PASID based dev TLBs, only support all PASIDs or single
> > > > PASID */
> > > > +   {1, 1, 0},
> > >
> > > Is this combination correct? when single PASID is being specified, it
> > > is essentially a page-selective invalidation since you need provide
> > > Address and Size.
> > >
> > This is for translation from generic UAPI granu to VT-d granu; it
> > has nothing to do with address and size.
> 
> Generic UAPI defines three granularities: domain, pasid and addr.
> from the definition domain applies all entries related to did, pasid
> applies to all entries related to pasid, while addr is specific for a
> range.
> 
> from what we just confirmed internally with VT-d spec owner, our
> PASID based dev TLB invalidation always requires addr and size,
> while current uAPI doesn't support multiple PASIDs based range
> invaliation. It sounds to me that you want to use domain to replace
> multiple PASIDs case (G=1), but it then changes the meaning of
> the domain 

RE: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function

2020-04-01 Thread Tian, Kevin
> From: Jacob Pan 
> Sent: Wednesday, April 1, 2020 5:08 AM
> 
> On Tue, 31 Mar 2020 03:34:22 +
> "Tian, Kevin"  wrote:
> 
> > > From: Auger Eric 
> > > Sent: Monday, March 30, 2020 12:05 AM
> > >
> > > On 3/28/20 11:01 AM, Tian, Kevin wrote:
> > > >> From: Jacob Pan 
> > > >> Sent: Saturday, March 21, 2020 7:28 AM
> > > >>
> > > >> When Shared Virtual Address (SVA) is enabled for a guest OS via
> > > >> vIOMMU, we need to provide invalidation support at IOMMU API
> > > >> and
> > > driver
> > > >> level. This patch adds Intel VT-d specific function to implement
> > > >> iommu passdown invalidate API for shared virtual address.
> > > >>
> > > >> The use case is for supporting caching structure invalidation
> > > >> of assigned SVM capable devices. Emulated IOMMU exposes queue
> > > >
> > > > emulated IOMMU -> vIOMMU, since virito-iommu could use the
> > > > interface as well.
> > > >
> > > >> invalidation capability and passes down all descriptors from the
> > > >> guest to the physical IOMMU.
> > > >>
> > > >> The assumption is that guest to host device ID mapping should be
> > > >> resolved prior to calling the IOMMU driver. Based on the device
> > > >> handle, the host IOMMU driver can replace certain fields before
> > > >> submitting to the invalidation queue.
> > > >>
> > > >> ---
> > > >> v7 review fixed in v10
> > > >> ---
> > > >>
> > > >> Signed-off-by: Jacob Pan 
> > > >> Signed-off-by: Ashok Raj 
> > > >> Signed-off-by: Liu, Yi L 
> > > >> ---
> > > >>  drivers/iommu/intel-iommu.c | 182
> > > >> 
> > > >>  1 file changed, 182 insertions(+)
> > > >>
> > > >> diff --git a/drivers/iommu/intel-iommu.c
> > > >> b/drivers/iommu/intel-iommu.c index b1477cd423dd..a76afb0fd51a
> > > >> 100644 --- a/drivers/iommu/intel-iommu.c
> > > >> +++ b/drivers/iommu/intel-iommu.c
> > > >> @@ -5619,6 +5619,187 @@ static void
> > > >> intel_iommu_aux_detach_device(struct iommu_domain *domain,
> > > >>aux_domain_remove_dev(to_dmar_domain(domain), dev);
> > > >>  }
> > > >>
> > > >> +/*
> > > >> + * 2D array for converting and sanitizing IOMMU generic TLB granularity to
> > > >> + * VT-d granularity. Invalidation is typically included in the unmap operation
> > > >> + * as a result of DMA or VFIO unmap. However, for assigned devices guest
> > > >> + * owns the first level page tables. Invalidations of translation caches in the
> > > >> + * guest are trapped and passed down to the host.
> > > >> + *
> > > >> + * vIOMMU in the guest will only expose first level page tables, therefore
> > > >> + * we do not include IOTLB granularity for request without PASID (second level).
> > > >
> > > > I would revise above as "We do not support IOTLB granularity for
> > > > request without PASID (second level), therefore any vIOMMU
> > > > implementation that exposes the SVA capability to the guest
> > > > should only expose the first level page tables, implying all
> > > > invalidation requests from the guest will include a valid PASID"
> > > >
> > > >> + *
> > > >> + * For example, to find the VT-d granularity encoding for IOTLB
> > > >> + * type and page selective granularity within PASID:
> > > >> + * X: indexed by iommu cache type
> > > >> + * Y: indexed by enum iommu_inv_granularity
> > > >> + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
> > > >> + *
> > > >> + * Granu_map array indicates validity of the table. 1: valid, 0: invalid
> > > >> + *
> > > >> + */
> > > >> +const static int
> > > >> inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > > >> +  /*
> > > >> +   * PASID based IOTLB invalidation: PASID selective (per PASID),
> > > >> +   * page selective (address granularity)
> > > >> +   */
> > > >> +  {0, 1, 1},
> > > >> +  /* PASID based dev TLBs, only support all PASIDs or single PASID */
> > > >> +  {1, 1, 0},
> > > >
> > > > Is this combination correct? When single PASID is being
> > > > specified, it is essentially a page-selective invalidation since
> > > > you need to provide Address and Size.
> > > >
> > > >> +  /* PASID cache */
> > > >
> > > > PASID cache is fully managed by the host. Guest PASID cache
> > > > invalidation is interpreted by vIOMMU for bind and unbind
> > > > operations. I don't think we should accept any PASID cache
> > > > invalidation from userspace or guest.
> > > I tend to agree here.
> > > >
> > > >> +  {1, 1, 0}
> > > >> +};
> > > >> +
> > > >> +const static int
> > > >> inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > > >> +  /* PASID based IOTLB */
> > > >> +  {0, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
> > > >> +  /* PASID based dev TLBs */
> > > >> +  {QI_DEV_IOTLB_GRAN_ALL, QI_DEV_IOTLB_GRAN_PASID_SEL, 0},
> > > >> +  /* PASID cache */
> > > >> +  {QI_PC_ALL_PASIDS, QI_PC_PASID_SEL, 0},
> > > >> 
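For reference, a minimal sketch of how such a map/table pair might be
consulted; the helper name to_vtd_granularity() and the -EINVAL convention
here are illustrative assumptions rather than the patch's exact code:

static inline int to_vtd_granularity(int type, int granu, int *vtd_granu)
{
	/* granu_map gates validity; granu_table supplies the VT-d encoding */
	if (type >= IOMMU_CACHE_INV_TYPE_NR || granu >= IOMMU_INV_GRANU_NR ||
	    !inv_type_granu_map[type][granu])
		return -EINVAL;

	*vtd_granu = inv_type_granu_table[type][granu];
	return 0;
}

Rejecting invalid combinations up front is also what makes Kevin's later
suggestion plausible: a single table with a negative sentinel for
unsupported entries could fold the map and the table into one lookup.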

RE: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function

2020-04-01 Thread Tian, Kevin
> From: Jacob Pan 
> Sent: Wednesday, April 1, 2020 4:58 AM
> 
> On Tue, 31 Mar 2020 02:49:21 +
> "Tian, Kevin"  wrote:
> 
> > > From: Auger Eric 
> > > Sent: Sunday, March 29, 2020 11:34 PM
> > >
> > > Hi,
> > >
> > > On 3/28/20 11:01 AM, Tian, Kevin wrote:
> > > >> From: Jacob Pan 
> > > >> Sent: Saturday, March 21, 2020 7:28 AM
> > > >>
> > > >> When Shared Virtual Address (SVA) is enabled for a guest OS via
> > > >> vIOMMU, we need to provide invalidation support at IOMMU API
> > > >> and
> > > driver
> > > >> level. This patch adds Intel VT-d specific function to implement
> > > >> iommu passdown invalidate API for shared virtual address.
> > > >>
> > > >> The use case is for supporting caching structure invalidation
> > > >> of assigned SVM capable devices. Emulated IOMMU exposes queue
> >  [...]
> >  [...]
> > > >> + * to VT-d granularity. Invalidation is typically included in the unmap
> > > >> + * operation as a result of DMA or VFIO unmap. However, for assigned
> > > >> + * devices guest owns the first level page tables. Invalidations of
> > > >> + * translation caches in the
> >  [...]
> >  [...]
> >  [...]
> > > >> inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > > >> +  /*
> > > >> +   * PASID based IOTLB invalidation: PASID selective (per PASID),
> > > >> +   * page selective (address granularity)
> > > >> +   */
> > > >> +  {0, 1, 1},
> > > >> +  /* PASID based dev TLBs, only support all PASIDs or single PASID */
> > > >> +  {1, 1, 0},
> > > >
> > > > Is this combination correct? When single PASID is being
> > > > specified, it is essentially a page-selective invalidation since
> > > > you need to provide Address and Size.
> > > Isn't it the same when G=1? Still the addr/size is used. Doesn't
> > > it
> >
> > I thought addr/size is not used when G=1, but it might be wrong. I'm
> > checking with our vt-d spec owner.
> >
> 
> > > correspond to IOMMU_INV_GRANU_ADDR with
> > > IOMMU_INV_ADDR_FLAGS_PASID flag
> > > unset?
> > >
> > > so {0, 0, 1}?
> >
> I am not sure I got your logic. The three fields correspond to
>   IOMMU_INV_GRANU_DOMAIN, /* domain-selective invalidation */
>   IOMMU_INV_GRANU_PASID,  /* PASID-selective invalidation */
>   IOMMU_INV_GRANU_ADDR,   /* page-selective invalidation */
> 
> For devTLB, we use domain to mean global since there is no domain
> for devTLB. Then I came up with {1, 1, 0}, which means we could have
> global and pasid granu invalidation for PASID based devTLB.
> 
> If the caller also provides addr and S bit, the flush routine will put

"also" -> "must", because vt-d requires addr/size must be provided
in devtlb descriptor, that is why Eric suggests {0, 0, 1}.

> that into the QI descriptor. I know this is a little odd, but from the
> granu translation p.o.v. the VT-d spec has no G bit for page selective
> invalidation.

We don't need such an odd way if we can do it properly.
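Concretely, the proposal both reviewers converge on would flip the dev
TLB row of inv_type_granu_map from {1, 1, 0} to {0, 0, 1}, accepting only
the addr granularity that always carries addr/size. An illustrative
sketch (the PASID cache row is kept only for shape; see the separate
objection to accepting it at all):

const static int
inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
	/* PASID based IOTLB: PASID selective and page selective */
	{0, 1, 1},
	/* PASID based dev TLBs: addr selective only, matching the spec */
	{0, 0, 1},
	/* PASID cache */
	{1, 1, 0},
};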

> 
> > I have one more open:
> >
> > How does userspace know which invalidation type/gran is supported?
> > I didn't see such capability reporting in Yi's VFIO vSVA patch set.
> > Do we want the user/kernel to assume the same capability set if they
> > are architectural? However, the kernel could also do some optimization,
> > e.g. hide the devtlb invalidation capability given that the kernel
> > already invalidates devtlb automatically when serving iotlb
> > invalidation...
> >
> In general, we are trending toward using the VFIO capability chain to
> expose iommu capabilities.
> 
> But for architectural features such as type/granu, we have to assume
> the same capability between host & guest. Granu and types are not
> enumerated on the host IOMMU either.
> 
> For the devTLB optimization, I agree we need to expose a capability to
> the guest stating that implicit devtlb invalidation is supported.
> Otherwise, a Linux guest may run on other host OSes that do not support
> implicit devtlb invalidation.
> 
> Right Yi?

Thanks for the explanation. So we are assumed to support all operations
defined in the spec, with no need to expose them one-by-one. For the
optimization, I'm fine with doing it later.
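For readers unfamiliar with the VFIO capability chain mentioned above:
it is a list of capability records chained off the info struct returned
by VFIO_IOMMU_GET_INFO. A hedged userspace sketch of walking it follows;
the ioctl, the VFIO_IOMMU_INFO_CAPS flag and struct vfio_info_cap_header
are existing VFIO UAPI, while any capability ID for invalidation
granularities would be new and is exactly what is being debated here:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static void walk_iommu_caps(int container_fd)
{
	struct vfio_iommu_type1_info hdr = { .argsz = sizeof(hdr) };
	struct vfio_iommu_type1_info *info;
	struct vfio_info_cap_header *cap;

	/* First call only learns the full size, including the cap chain */
	if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, &hdr))
		return;

	info = calloc(1, hdr.argsz);
	if (!info)
		return;
	info->argsz = hdr.argsz;
	if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, info) ||
	    !(info->flags & VFIO_IOMMU_INFO_CAPS) || !info->cap_offset)
		goto out;

	/* cap_offset and next are byte offsets from the start of info */
	cap = (struct vfio_info_cap_header *)((char *)info + info->cap_offset);
	for (;;) {
		printf("capability id %u version %u\n", cap->id, cap->version);
		if (!cap->next)
			break;
		cap = (struct vfio_info_cap_header *)((char *)info + cap->next);
	}
out:
	free(info);
}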

> 
> > Thanks
> > Kevin
> >
> > >
> > > Thanks
> > >
> > > Eric
> > >
> > > >
> > > >> +  /* PASID cache */
> > > >
> > > > PASID cache is fully managed by the host. Guest PASID cache
> > > > invalidation is interpreted by vIOMMU for bind and unbind
> > > > operations. I don't think we should accept any PASID cache
> > > > invalidation from userspace or guest.
> >  [...]
> inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> >  [...]
> > > >
> > > > btw do we really need both map and table here? Can't we just
> > > > use one table with unsupported granularity marked as a special
> > > > value?
> > > >
> >  [...]
> > > >
> > > > -ENOTSUPP?
> > > >
> >  [...]
> > > >
> > > > granularity == IOMMU_INV_GRANU_ADDR? Otherwise it's unclear
> > > > why IOMMU_INV_GRANU_DOMAIN also needs a size check.
> > > >
> >  [...]
> > > >>> 

RE: [PATCH V10 08/11] iommu/vt-d: Add svm/sva invalidate function

2020-04-01 Thread Tian, Kevin
> From: Jacob Pan 
> Sent: Wednesday, April 1, 2020 2:14 AM
> 
> On Sat, 28 Mar 2020 10:01:42 +
> "Tian, Kevin"  wrote:
> 
> > > From: Jacob Pan 
> > > Sent: Saturday, March 21, 2020 7:28 AM
> > >
> > > When Shared Virtual Address (SVA) is enabled for a guest OS via
> > > vIOMMU, we need to provide invalidation support at IOMMU API and
> > > driver level. This patch adds Intel VT-d specific function to
> > > implement iommu passdown invalidate API for shared virtual address.
> > >
> > > The use case is for supporting caching structure invalidation
> > > of assigned SVM capable devices. Emulated IOMMU exposes queue
> >
> > emulated IOMMU -> vIOMMU, since virtio-iommu could use the
> > interface as well.
> >
> True, but it does not invalidate this statement about emulated IOMMU. I
> will add another statement saying "the same interface can be used for
> virtio-IOMMU as well". OK?

sure

> 
> > > invalidation capability and passes down all descriptors from the
> > > guest to the physical IOMMU.
> > >
> > > The assumption is that guest to host device ID mapping should be
> > > resolved prior to calling the IOMMU driver. Based on the device handle,
> > > the host IOMMU driver can replace certain fields before submitting to
> > > the invalidation queue.
> > >
> > > ---
> > > v7 review fixed in v10
> > > ---
> > >
> > > Signed-off-by: Jacob Pan 
> > > Signed-off-by: Ashok Raj 
> > > Signed-off-by: Liu, Yi L 
> > > ---
> > >  drivers/iommu/intel-iommu.c | 182
> > > 
> > >  1 file changed, 182 insertions(+)
> > >
> > > diff --git a/drivers/iommu/intel-iommu.c
> > > b/drivers/iommu/intel-iommu.c index b1477cd423dd..a76afb0fd51a
> > > 100644 --- a/drivers/iommu/intel-iommu.c
> > > +++ b/drivers/iommu/intel-iommu.c
> > > @@ -5619,6 +5619,187 @@ static void
> > > intel_iommu_aux_detach_device(struct iommu_domain *domain,
> > >   aux_domain_remove_dev(to_dmar_domain(domain), dev);
> > >  }
> > >
> > > +/*
> > > + * 2D array for converting and sanitizing IOMMU generic TLB granularity to
> > > + * VT-d granularity. Invalidation is typically included in the unmap operation
> > > + * as a result of DMA or VFIO unmap. However, for assigned devices guest
> > > + * owns the first level page tables. Invalidations of translation caches in the
> > > + * guest are trapped and passed down to the host.
> > > + *
> > > + * vIOMMU in the guest will only expose first level page tables, therefore
> > > + * we do not include IOTLB granularity for request without PASID (second level).
> >
> > I would revise above as "We do not support IOTLB granularity for
> > request without PASID (second level), therefore any vIOMMU
> > implementation that exposes the SVA capability to the guest should
> > only expose the first level page tables, implying all invalidation
> > requests from the guest will include a valid PASID"
> >
> Sounds good.
> 
> > > + *
> > > + * For example, to find the VT-d granularity encoding for IOTLB
> > > + * type and page selective granularity within PASID:
> > > + * X: indexed by iommu cache type
> > > + * Y: indexed by enum iommu_inv_granularity
> > > + * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
> > > + *
> > > + * Granu_map array indicates validity of the table. 1: valid, 0: invalid
> > > + *
> > > + */
> > > +const static int
> > > inv_type_granu_map[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
> > > +	/*
> > > +	 * PASID based IOTLB invalidation: PASID selective (per PASID),
> > > +	 * page selective (address granularity)
> > > +	 */
> > > +	{0, 1, 1},
> > > +	/* PASID based dev TLBs, only support all PASIDs or single PASID */
> > > +	{1, 1, 0},
> >
> > Is this combination correct? When single PASID is being specified, it
> > is essentially a page-selective invalidation since you need to provide
> > Address and Size.
> >
> This is for translation from generic UAPI granu to VT-d granu; it
> has nothing to do with address and size.

Generic UAPI defines three granularities: domain, pasid and addr.
From the definition, domain applies to all entries related to did, pasid
applies to all entries related to pasid, while addr is specific to a
range.

From what we just confirmed internally with the VT-d spec owner, our
PASID based dev TLB invalidation always requires addr and size,
while the current uAPI doesn't support multiple-PASID based range
invalidation. It sounds to me that you want to use domain to cover the
multiple PASIDs case (G=1), but that changes the meaning of the
domain granularity and easily leads to confusion.

I feel Eric's proposal makes more sense. Here we'd better use {0, 0, 1}
to indicate that only addr range invalidation is allowed, matching the
spec definition. We may use a special flag in iommu_inv_addr_info
to indicate the G=1 case, if necessary.
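A sketch of how that could look on the wire, assuming the struct
iommu_cache_invalidate_info / struct iommu_inv_addr_info layout from this
series stays as posted; IOMMU_INV_ADDR_FLAGS_GLOBAL is invented here
purely to illustrate the "special flag for G=1" idea and is not part of
the proposed uAPI:

#include <string.h>
#include <linux/iommu.h>	/* uapi header carrying the structs from this series */

static void fill_devtlb_inv(struct iommu_cache_invalidate_info *inv,
			    __u64 pasid, __u64 iova, __u64 nr_pages)
{
	memset(inv, 0, sizeof(*inv));
	inv->version     = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
	inv->cache       = IOMMU_CACHE_INV_TYPE_DEV_IOTLB;
	inv->granularity = IOMMU_INV_GRANU_ADDR;	/* the only valid row: {0, 0, 1} */

	inv->granu.addr_info.flags = IOMMU_INV_ADDR_FLAGS_PASID;
	/* inv->granu.addr_info.flags |= IOMMU_INV_ADDR_FLAGS_GLOBAL;  hypothetical G=1 */
	inv->granu.addr_info.pasid = pasid;
	inv->granu.addr_info.addr  = iova;	/* addr/size always supplied */
	inv->granu.addr_info.granule_size = 4096;
	inv->granu.addr_info.nb_granules  = nr_pages;
}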

> e.g.
> If the user passes IOMMU_INV_GRANU_PASID for the single PASID case as you
> mentioned, this map table shows it is valid.
> 
> Then the lookup result will