Re: [PATCH v3 1/2] vfio/type1: Simplify bus_type determination

2022-06-30 Thread Alex Williamson
On Fri, 24 Jun 2022 18:51:44 +0100
Robin Murphy  wrote:

> Since IOMMU groups are mandatory for drivers to support, it stands to
> reason that any device which has been successfully added to a group
> must be on a bus supported by that IOMMU driver, and therefore a domain
> viable for any device in the group must be viable for all devices in
> the group. This already has to be the case for the IOMMU API's internal
> default domain, for instance. Thus even if the group contains devices on
> different buses, that can only mean that the IOMMU driver actually
> supports such an odd topology, and so without loss of generality we can
> expect the bus type of any device in a group to be suitable for IOMMU
> API calls.
> 
> Furthermore, scrutiny reveals a lack of protection for the bus being
> removed while vfio_iommu_type1_attach_group() is using it; the reference
> that VFIO holds on the iommu_group ensures that data remains valid, but
> does not prevent the group's membership changing underfoot.
> 
> We can address both concerns by recycling vfio_bus_type() into some
> superficially similar logic to indirect the IOMMU API calls themselves.
> Each call is thus protected from races by the IOMMU group's own locking,
> and we no longer need to hold group-derived pointers beyond that scope.
> It also gives us an easy path for the IOMMU API's migration of bus-based
> interfaces to device-based, of which we can already take the first step
> with device_iommu_capable(). As with domains, any capability must in
> practice be consistent for devices in a given group - and after all it's
> still the same capability which was expected to be consistent across an
> entire bus! - so there's no need for any complicated validation.
> 
> Signed-off-by: Robin Murphy 
> ---

Applied series to vfio next branch for v5.20.  Thanks,

Alex


> v3: Complete rewrite yet again, and finally it doesn't feel like we're
> stretching any abstraction boundaries the wrong way, and the diffstat
> looks right too. I did think about embedding IOMMU_CAP_INTR_REMAP
> directly in the callback, but decided I like the consistency of minimal
> generic wrappers. And yes, if the capability isn't supported then it
> does end up getting tested for the whole group, but meh, it's harmless.
> 
>  drivers/vfio/vfio_iommu_type1.c | 42 +
>  1 file changed, 22 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index c13b9290e357..a77ff00c677b 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -1679,18 +1679,6 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>   return ret;
>  }
>  
> -static int vfio_bus_type(struct device *dev, void *data)
> -{
> - struct bus_type **bus = data;
> -
> - if (*bus && *bus != dev->bus)
> - return -EINVAL;
> -
> - *bus = dev->bus;
> -
> - return 0;
> -}
> -
>  static int vfio_iommu_replay(struct vfio_iommu *iommu,
>struct vfio_domain *domain)
>  {
> @@ -2153,13 +2141,25 @@ static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
>   list_splice_tail(iova_copy, iova);
>  }
>  
> +static int vfio_iommu_device_capable(struct device *dev, void *data)
> +{
> + return device_iommu_capable(dev, (enum iommu_cap)data);
> +}
> +
> +static int vfio_iommu_domain_alloc(struct device *dev, void *data)
> +{
> + struct iommu_domain **domain = data;
> +
> + *domain = iommu_domain_alloc(dev->bus);
> + return 1; /* Don't iterate */
> +}
> +
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>   struct iommu_group *iommu_group, enum vfio_group_type type)
>  {
>   struct vfio_iommu *iommu = iommu_data;
>   struct vfio_iommu_group *group;
>   struct vfio_domain *domain, *d;
> - struct bus_type *bus = NULL;
>   bool resv_msi, msi_remap;
>   phys_addr_t resv_msi_base = 0;
>   struct iommu_domain_geometry *geo;
> @@ -2192,18 +2192,19 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>   goto out_unlock;
>   }
>  
> - /* Determine bus_type in order to allocate a domain */
> - ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
> - if (ret)
> - goto out_free_group;
> -
>   ret = -ENOMEM;
>   domain = kzalloc(sizeof(*domain), GFP_KERNEL);
>   if (!domain)
>   goto out_free_group;
>  
> + /*
> +  * Going via the iommu_group iterator avoids races, and trivially gives
> +  * us a representative device for the IOMMU API call. We don't actually
> +  * want to iterate beyond the first device (if any).
> +  */
>   ret = -EIO;
> - domain->domain = iommu_domain_alloc(bus);
> + iommu_group_for_each_dev(iommu_group, &domain->domain,
> +  vfio_iommu_domain_alloc);
>   if (!domain->domain)
>   goto out_free_domain;
>  
> @@ -225
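
The hunk truncated above presumably carries the matching capability
check; as a sketch, reconstructed from the callbacks shown rather than
from the missing hunk itself, the wrapper would be driven like so:

	/* Probe IOMMU_CAP_INTR_REMAP across the whole group: the cap is
	 * smuggled through the void *data cookie, and iteration stops as
	 * soon as one device reports the capability. */
	msi_remap = irq_domain_check_msi_remap() ||
		    iommu_group_for_each_dev(iommu_group,
					     (void *)IOMMU_CAP_INTR_REMAP,
					     vfio_iommu_device_capable);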

Re: [PATCH v3 1/2] vfio/type1: Simplify bus_type determination

2022-06-27 Thread Alex Williamson
On Fri, 24 Jun 2022 18:51:44 +0100
Robin Murphy  wrote:

> Since IOMMU groups are mandatory for drivers to support, it stands to
> reason that any device which has been successfully added to a group
> must be on a bus supported by that IOMMU driver, and therefore a domain
> viable for any device in the group must be viable for all devices in
> the group. This already has to be the case for the IOMMU API's internal
> default domain, for instance. Thus even if the group contains devices on
> different buses, that can only mean that the IOMMU driver actually
> supports such an odd topology, and so without loss of generality we can
> expect the bus type of any device in a group to be suitable for IOMMU
> API calls.
> 
> Furthermore, scrutiny reveals a lack of protection for the bus being
> removed while vfio_iommu_type1_attach_group() is using it; the reference
> that VFIO holds on the iommu_group ensures that data remains valid, but
> does not prevent the group's membership changing underfoot.
> 
> We can address both concerns by recycling vfio_bus_type() into some
> superficially similar logic to indirect the IOMMU API calls themselves.
> Each call is thus protected from races by the IOMMU group's own locking,
> and we no longer need to hold group-derived pointers beyond that scope.
> It also gives us an easy path for the IOMMU API's migration of bus-based
> interfaces to device-based, of which we can already take the first step
> with device_iommu_capable(). As with domains, any capability must in
> practice be consistent for devices in a given group - and after all it's
> still the same capability which was expected to be consistent across an
> entire bus! - so there's no need for any complicated validation.
> 
> Signed-off-by: Robin Murphy 
> ---
> 
> v3: Complete rewrite yet again, and finally it doesn't feel like we're
> stretching any abstraction boundaries the wrong way, and the diffstat
> looks right too. I did think about embedding IOMMU_CAP_INTR_REMAP
> directly in the callback, but decided I like the consistency of minimal
> generic wrappers. And yes, if the capability isn't supported then it
> does end up getting tested for the whole group, but meh, it's harmless.
> 
>  drivers/vfio/vfio_iommu_type1.c | 42 +
>  1 file changed, 22 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index c13b9290e357..a77ff00c677b 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -1679,18 +1679,6 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>   return ret;
>  }
>  
> -static int vfio_bus_type(struct device *dev, void *data)
> -{
> - struct bus_type **bus = data;
> -
> - if (*bus && *bus != dev->bus)
> - return -EINVAL;
> -
> - *bus = dev->bus;
> -
> - return 0;
> -}
> -
>  static int vfio_iommu_replay(struct vfio_iommu *iommu,
>struct vfio_domain *domain)
>  {
> @@ -2153,13 +2141,25 @@ static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
>   list_splice_tail(iova_copy, iova);
>  }
>  

Any objection if I add the following comment:

/* Redundantly walks non-present capabilities to simplify caller */

Thanks,
Alex

> +static int vfio_iommu_device_capable(struct device *dev, void *data)
> +{
> + return device_iommu_capable(dev, (enum iommu_cap)data);
> +}
> +
> +static int vfio_iommu_domain_alloc(struct device *dev, void *data)
> +{
> + struct iommu_domain **domain = data;
> +
> + *domain = iommu_domain_alloc(dev->bus);
> + return 1; /* Don't iterate */
> +}
> +
>  static int vfio_iommu_type1_attach_group(void *iommu_data,
>   struct iommu_group *iommu_group, enum vfio_group_type type)
>  {
>   struct vfio_iommu *iommu = iommu_data;
>   struct vfio_iommu_group *group;
>   struct vfio_domain *domain, *d;
> - struct bus_type *bus = NULL;
>   bool resv_msi, msi_remap;
>   phys_addr_t resv_msi_base = 0;
>   struct iommu_domain_geometry *geo;
> @@ -2192,18 +2192,19 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>   goto out_unlock;
>   }
>  
> - /* Determine bus_type in order to allocate a domain */
> - ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
> - if (ret)
> - goto out_free_group;
> -
>   ret = -ENOMEM;
>   domain = kzalloc(sizeof(*domain), GFP_KERNEL);
>   if (!domain)
>   goto out_free_group;
>  
> + /*
> +  * Going via the iommu_group iterator avoids races, and trivially gives
> +  * us a representative device for the IOMMU API call. We don't actually
> +  * want to iterate beyond the first device (if any).
> +  */
>   ret = -EIO;
> - domain->domain = iommu_domain_alloc(bus);
> + iommu_group_for_each_dev(iommu_group, &domain->domain,
> +  vfio_iommu_domain_alloc);
>   if
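
For reference, the comment Alex proposes would sit directly above the
capability callback:

/* Redundantly walks non-present capabilities to simplify caller */
static int vfio_iommu_device_capable(struct device *dev, void *data)
{
	return device_iommu_capable(dev, (enum iommu_cap)data);
}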

Re: [PATCH v2 1/2] vfio/type1: Simplify bus_type determination

2022-06-24 Thread Alex Williamson
On Fri, 24 Jun 2022 16:12:55 +0100
Robin Murphy  wrote:

> On 2022-06-24 15:28, Alex Williamson wrote:
> > On Fri, 24 Jun 2022 11:18:36 -0300
> > Jason Gunthorpe  wrote:
> >   
> >> On Fri, Jun 24, 2022 at 08:11:59AM -0600, Alex Williamson wrote:  
> >>> On Thu, 23 Jun 2022 22:50:30 -0300
> >>> Jason Gunthorpe  wrote:
> >>>  
> >>>> On Thu, Jun 23, 2022 at 05:00:44PM -0600, Alex Williamson wrote:
> >>>>  
> >>>>>>>> +struct vfio_device *vfio_device_get_from_iommu(struct iommu_group *iommu_group)
> >>>>>>>> +{
> >>>>>>>> +	struct vfio_group *group = vfio_group_get_from_iommu(iommu_group);
> >>>>>>>> +	struct vfio_device *device;
> >>>>>>>
> >>>>>>> Check group for NULL.  
> >>>>>>
> >>>>>> OK - FWIW in context this should only ever make sense to call with an
> >>>>>> iommu_group which has already been derived from a vfio_group, and I did
> >>>>>> initially consider a check with a WARN_ON(), but then decided that the
> >>>>>> unguarded dereference would be a sufficiently strong message. No 
> >>>>>> problem
> >>>>>> with bringing that back to make it more defensive if that's what you 
> >>>>>> prefer.  
> >>>>>
> >>>>> A while down the road, that's a bit too much implicit knowledge of the
> >>>>> intent and single purpose of this function just to simply avoid a test. 
> >>>>>  
> >>>>
> >>>> I think we should just pass the 'struct vfio_group *' into the
> >>>> attach_group op and have this API take that type in and forget the
> >>>> vfio_group_get_from_iommu().  
> >>>
> >>> That's essentially what I'm suggesting, the vfio_group is passed as an
> >>> opaque pointer which type1 can use for a
> >>> vfio_group_for_each_vfio_device() type call.  Thanks,  
> >>
> >> I don't want to add a whole vfio_group_for_each_vfio_device()
> >> machinery that isn't actually needed by anything.. This is all
> >> internal, we don't need to design more than exactly what is needed.
> >>
> >> At this point if we change the signature of the attach then we may as
> >> well just pass in the representative vfio_device, that is probably
> >> less LOC overall.  
> > 
> > That means that vfio core still needs to pick an arbitrary
> > representative device, which I find in fundamental conflict to the
> > nature of groups.  Type1 is the interface to the IOMMU API, if through
> > the IOMMU API we can make an assumption that all devices within the
> > group are equivalent for a given operation, that should be done in type1
> > code, not in vfio core.  A for-each interface is commonplace and not
> > significantly more code or design than already proposed.  Thanks,  
> 
> It also occurred to me this morning that there's another middle-ground 
> option staring out from the call-wrapping notion I mentioned yesterday - 
> while I'm not keen to provide it from the IOMMU API, there's absolutely 
> no reason that VFIO couldn't just use the building blocks by itself, and 
> in fact it works out almost absurdly simple:
> 
> static bool vfio_device_capable(struct device *dev, void *data)
> {
>   return device_iommu_capable(dev, (enum iommu_cap)data);
> }
> 
> bool vfio_group_capable(struct iommu_group *group, enum iommu_cap cap)
> {
>   return iommu_group_for_each_dev(group, (void *)cap, 
> vfio_device_capable);
> }
> 
> and much the same for iommu_domain_alloc() once I get that far. The 
> locking concern neatly disappears because we're no longer holding any 
> bus or device pointer that can go stale. How does that seem as a 
> compromise for now, looking forward to Jason's longer-term view of 
> rearranging the attach_group process such that a vfio_device falls 
> naturally to hand?

Yup, that seems like another way to do it, a slight iteration on the
current bus_type flow, and also avoids any sort of arbitrary
representative device being passed around as an API.

For clarity of the principle that all devices within the group should
have the same capabilities, we could even further follow the existing
bus_type and do a sanity test here at the same time, or perhaps simply
stop after the first device to avoid the if-any-device-is-capable
semantics implied above.  Thanks,

Alex
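
For completeness, the "much the same for iommu_domain_alloc()" wrapper
Robin alludes to above might look like the following hypothetical
sketch (the version actually posted with v3 appears earlier in this
archive as vfio_iommu_domain_alloc()):

static int vfio_device_domain_alloc(struct device *dev, void *data)
{
	struct iommu_domain **domain = data;

	/* Allocate from the first device's bus, then stop iterating */
	*domain = iommu_domain_alloc(dev->bus);
	return 1;
}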



Re: [PATCH v2 1/2] vfio/type1: Simplify bus_type determination

2022-06-24 Thread Alex Williamson
On Fri, 24 Jun 2022 11:18:36 -0300
Jason Gunthorpe  wrote:

> On Fri, Jun 24, 2022 at 08:11:59AM -0600, Alex Williamson wrote:
> > On Thu, 23 Jun 2022 22:50:30 -0300
> > Jason Gunthorpe  wrote:
> >   
> > > On Thu, Jun 23, 2022 at 05:00:44PM -0600, Alex Williamson wrote:
> > >   
> > > > > >> +struct vfio_device *vfio_device_get_from_iommu(struct iommu_group *iommu_group)
> > > > > >> +{
> > > > > >> +  struct vfio_group *group = vfio_group_get_from_iommu(iommu_group);
> > > > > >> +  struct vfio_device *device;
> > > > > > 
> > > > > > Check group for NULL.  
> > > > > 
> > > > > OK - FWIW in context this should only ever make sense to call with an 
> > > > > iommu_group which has already been derived from a vfio_group, and I 
> > > > > did 
> > > > > initially consider a check with a WARN_ON(), but then decided that 
> > > > > the 
> > > > > unguarded dereference would be a sufficiently strong message. No 
> > > > > problem 
> > > > > with bringing that back to make it more defensive if that's what you 
> > > > > prefer.
> > > > 
> > > > A while down the road, that's a bit too much implicit knowledge of the
> > > > intent and single purpose of this function just to simply avoid a test. 
> > > >
> > > 
> > > I think we should just pass the 'struct vfio_group *' into the
> > > attach_group op and have this API take that type in and forget the
> > > vfio_group_get_from_iommu().  
> > 
> > That's essentially what I'm suggesting, the vfio_group is passed as an
> > opaque pointer which type1 can use for a
> > vfio_group_for_each_vfio_device() type call.  Thanks,  
> 
> I don't want to add a whole vfio_group_for_each_vfio_device()
> machinery that isn't actually needed by anything.. This is all
> internal, we don't need to design more than exactly what is needed.
> 
> At this point if we change the signature of the attach then we may as
> well just pass in the representative vfio_device, that is probably
> less LOC overall.

That means that vfio core still needs to pick an arbitrary
representative device, which I find in fundamental conflict to the
nature of groups.  Type1 is the interface to the IOMMU API, if through
the IOMMU API we can make an assumption that all devices within the
group are equivalent for a given operation, that should be done in type1
code, not in vfio core.  A for-each interface is commonplace and not
significantly more code or design than already proposed.  Thanks,

Alex
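
For scale, the for-each interface Alex refers to would be on the order
of the following hypothetical sketch (name and locking modeled on the
existing vfio_group_get_device(); never merged in this form):

int vfio_group_for_each_vfio_device(struct vfio_group *group, void *data,
				    int (*fn)(struct vfio_device *, void *))
{
	struct vfio_device *device;
	int ret = 0;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		/* a nonzero return stops iteration, as with the IOMMU API */
		ret = fn(device, data);
		if (ret)
			break;
	}
	mutex_unlock(&group->device_lock);
	return ret;
}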



Re: [PATCH v2 1/2] vfio/type1: Simplify bus_type determination

2022-06-24 Thread Alex Williamson
On Thu, 23 Jun 2022 22:50:30 -0300
Jason Gunthorpe  wrote:

> On Thu, Jun 23, 2022 at 05:00:44PM -0600, Alex Williamson wrote:
> 
> > > >> +struct vfio_device *vfio_device_get_from_iommu(struct iommu_group *iommu_group)
> > > >> +{
> > > >> +  struct vfio_group *group = vfio_group_get_from_iommu(iommu_group);
> > > >> +  struct vfio_device *device;
> > > > 
> > > > Check group for NULL.
> > > 
> > > OK - FWIW in context this should only ever make sense to call with an 
> > > iommu_group which has already been derived from a vfio_group, and I did 
> > > initially consider a check with a WARN_ON(), but then decided that the 
> > > unguarded dereference would be a sufficiently strong message. No problem 
> > > with bringing that back to make it more defensive if that's what you 
> > > prefer.  
> > 
> > A while down the road, that's a bit too much implicit knowledge of the
> > intent and single purpose of this function just to simply avoid a test.  
> 
> I think we should just pass the 'struct vfio_group *' into the
> attach_group op and have this API take that type in and forget the
> vfio_group_get_from_iommu().

That's essentially what I'm suggesting, the vfio_group is passed as an
opaque pointer which type1 can use for a
vfio_group_for_each_vfio_device() type call.  Thanks,

Alex



Re: [PATCH v2 1/2] vfio/type1: Simplify bus_type determination

2022-06-23 Thread Alex Williamson
On Thu, 23 Jun 2022 13:23:05 +0100
Robin Murphy  wrote:

> On 2022-06-22 23:17, Alex Williamson wrote:
> > On Wed, 22 Jun 2022 13:04:11 +0100
> > Robin Murphy  wrote:
> >   
> >> Since IOMMU groups are mandatory for drivers to support, it stands to
> >> reason that any device which has been successfully be added to a group  
> > 
> > s/be //  
> 
> Oops.
> 
> >> must be on a bus supported by that IOMMU driver, and therefore a domain
> >> viable for any device in the group must be viable for all devices in
> >> the group. This already has to be the case for the IOMMU API's internal
> >> default domain, for instance. Thus even if the group contains devices on
> >> different buses, that can only mean that the IOMMU driver actually
> >> supports such an odd topology, and so without loss of generality we can
> >> expect the bus type of any device in a group to be suitable for IOMMU
> >> API calls.
> >>
> >> Replace vfio_bus_type() with a simple call to resolve an appropriate
> >> member device from which to then derive a bus type. This is also a step
> >> towards removing the vague bus-based interfaces from the IOMMU API, when
> >> we can subsequently switch to using this device directly.
> >>
> >> Furthermore, scrutiny reveals a lack of protection for the bus being
> >> removed while vfio_iommu_type1_attach_group() is using it; the reference
> >> that VFIO holds on the iommu_group ensures that data remains valid, but
> >> does not prevent the group's membership changing underfoot. Holding the
> >> vfio_device for as long as we need here also neatly solves this.
> >>
> >> Signed-off-by: Robin Murphy 
> >> ---
> >>
> >> After sleeping on it, I decided to type up the helper function approach
> >> to see how it looked in practice, and in doing so realised that with one
> >> more tweak it could also subsume the locking out of the common paths as
> >> well, so end up being a self-contained way for type1 to take care of its
> >> own concern, which I rather like.
> >>
> >>   drivers/vfio/vfio.c | 18 +-
> >>   drivers/vfio/vfio.h |  3 +++
> >>   drivers/vfio/vfio_iommu_type1.c | 30 +++---
> >>   3 files changed, 31 insertions(+), 20 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> >> index 61e71c1154be..73bab04880d0 100644
> >> --- a/drivers/vfio/vfio.c
> >> +++ b/drivers/vfio/vfio.c
> >> @@ -448,7 +448,7 @@ static void vfio_group_get(struct vfio_group *group)
> >>* Device objects - create, release, get, put, search
> >>*/
> >>   /* Device reference always implies a group reference */
> >> -static void vfio_device_put(struct vfio_device *device)
> >> +void vfio_device_put(struct vfio_device *device)
> >>   {
> >>if (refcount_dec_and_test(&device->refcount))
> >>complete(&device->comp);
> >> @@ -475,6 +475,22 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
> >>return NULL;
> >>   }
> >>   
> >> +struct vfio_device *vfio_device_get_from_iommu(struct iommu_group *iommu_group)
> >> +{
> >> +  struct vfio_group *group = vfio_group_get_from_iommu(iommu_group);
> >> +  struct vfio_device *device;  
> > 
> > Check group for NULL.  
> 
> OK - FWIW in context this should only ever make sense to call with an 
> iommu_group which has already been derived from a vfio_group, and I did 
> initially consider a check with a WARN_ON(), but then decided that the 
> unguarded dereference would be a sufficiently strong message. No problem 
> with bringing that back to make it more defensive if that's what you prefer.

A while down the road, that's a bit too much implicit knowledge of the
intent and single purpose of this function just to simply avoid a test.

> >> +
> >> +  mutex_lock(&group->device_lock);
> >> +  list_for_each_entry(device, &group->device_list, group_next) {
> >> +  if (vfio_device_try_get(device)) {
> >> +  mutex_unlock(&group->device_lock);
> >> +  return device;
> >> +  }
> >> +  }
> >> +  mutex_unlock(&group->device_lock);
> >> +  return NULL;  
> > 
> > No vfio_group_put() on either path.  
> 
> Oops indeed.
> 
> >> +}

Re: [PATCH v2 1/2] vfio/type1: Simplify bus_type determination

2022-06-23 Thread Alex Williamson
On Thu, 23 Jun 2022 08:46:45 +
"Tian, Kevin"  wrote:

> > From: Alex Williamson 
> > Sent: Thursday, June 23, 2022 6:17 AM
> >   
> > >
> > >   ret = -EIO;
> > > - domain->domain = iommu_domain_alloc(bus);
> > > + domain->domain = iommu_domain_alloc(iommu_api_dev->dev->bus);
> > 
> > It makes sense to move away from a bus centric interface to iommu ops
> > and I can see that having a device interface when we have device level
> > address-ability within a group makes sense, but does it make sense to
> > only have that device level interface?  For example, if an iommu_group
> > is going to remain an aspect of the iommu subsystem, shouldn't we be
> > able to allocate a domain and test capabilities based on the group and
> > the iommu driver should have enough embedded information reachable
> > from
> > the struct iommu_group to do those things?  This "perform group level
> > operations based on an arbitrary device in the group" is pretty klunky.
> > Thanks,
> >   
> 
> This sounds a right thing to do.
> 
> btw another alternative which I'm thinking of is whether vfio_group
> can record the bus info when the first device is added to it in
> __vfio_register_dev(). Then we don't need a group interface from
> iommu to test if vfio is the only user having such requirement.

That might be more simple, but it's just another variation on vfio
picking an arbitrary device from a group to satisfy the iommu interface
rather than operating on an iommu subsystem provided object.  Thanks,

Alex
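
Kevin's record-the-bus alternative would amount to something like this
hypothetical sketch ('bus' being a new vfio_group field, not existing
code):

static void vfio_group_record_bus(struct vfio_group *group,
				  struct vfio_device *device)
{
	/* Called from __vfio_register_dev(); the first device wins */
	if (!group->bus)
		group->bus = device->dev->bus;
}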



Re: [PATCH v2 1/2] vfio/type1: Simplify bus_type determination

2022-06-22 Thread Alex Williamson
On Wed, 22 Jun 2022 13:04:11 +0100
Robin Murphy  wrote:

> Since IOMMU groups are mandatory for drivers to support, it stands to
> reason that any device which has been successfully be added to a group

s/be //

> must be on a bus supported by that IOMMU driver, and therefore a domain
> viable for any device in the group must be viable for all devices in
> the group. This already has to be the case for the IOMMU API's internal
> default domain, for instance. Thus even if the group contains devices on
> different buses, that can only mean that the IOMMU driver actually
> supports such an odd topology, and so without loss of generality we can
> expect the bus type of any device in a group to be suitable for IOMMU
> API calls.
> 
> Replace vfio_bus_type() with a simple call to resolve an appropriate
> member device from which to then derive a bus type. This is also a step
> towards removing the vague bus-based interfaces from the IOMMU API, when
> we can subsequently switch to using this device directly.
> 
> Furthermore, scrutiny reveals a lack of protection for the bus being
> removed while vfio_iommu_type1_attach_group() is using it; the reference
> that VFIO holds on the iommu_group ensures that data remains valid, but
> does not prevent the group's membership changing underfoot. Holding the
> vfio_device for as long as we need here also neatly solves this.
> 
> Signed-off-by: Robin Murphy 
> ---
> 
> After sleeping on it, I decided to type up the helper function approach
> to see how it looked in practice, and in doing so realised that with one
> more tweak it could also subsume the locking out of the common paths as
> well, so end up being a self-contained way for type1 to take care of its
> own concern, which I rather like.
> 
>  drivers/vfio/vfio.c | 18 +-
>  drivers/vfio/vfio.h |  3 +++
>  drivers/vfio/vfio_iommu_type1.c | 30 +++---
>  3 files changed, 31 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 61e71c1154be..73bab04880d0 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -448,7 +448,7 @@ static void vfio_group_get(struct vfio_group *group)
>   * Device objects - create, release, get, put, search
>   */
>  /* Device reference always implies a group reference */
> -static void vfio_device_put(struct vfio_device *device)
> +void vfio_device_put(struct vfio_device *device)
>  {
>   if (refcount_dec_and_test(&device->refcount))
>   complete(&device->comp);
> @@ -475,6 +475,22 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
>   return NULL;
>  }
>  
> +struct vfio_device *vfio_device_get_from_iommu(struct iommu_group *iommu_group)
> +{
> + struct vfio_group *group = vfio_group_get_from_iommu(iommu_group);
> + struct vfio_device *device;

Check group for NULL.

> +
> + mutex_lock(&group->device_lock);
> + list_for_each_entry(device, &group->device_list, group_next) {
> + if (vfio_device_try_get(device)) {
> + mutex_unlock(&group->device_lock);
> + return device;
> + }
> + }
> + mutex_unlock(&group->device_lock);
> + return NULL;

No vfio_group_put() on either path.

> +}
> +
>  /*
>   * VFIO driver API
>   */
> diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> index a67130221151..e8f21e64541b 100644
> --- a/drivers/vfio/vfio.h
> +++ b/drivers/vfio/vfio.h
> @@ -70,3 +70,6 @@ struct vfio_iommu_driver_ops {
>  
>  int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
>  void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops);
> +
> +struct vfio_device *vfio_device_get_from_iommu(struct iommu_group *iommu_group);
> +void vfio_device_put(struct vfio_device *device);
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index c13b9290e357..e38b8bfde677 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -1679,18 +1679,6 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>   return ret;
>  }
>  
> -static int vfio_bus_type(struct device *dev, void *data)
> -{
> - struct bus_type **bus = data;
> -
> - if (*bus && *bus != dev->bus)
> - return -EINVAL;
> -
> - *bus = dev->bus;
> -
> - return 0;
> -}
> -
>  static int vfio_iommu_replay(struct vfio_iommu *iommu,
>struct vfio_domain *domain)
>  {
> @@ -2159,7 +2147,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>   struct vfio_iommu *iommu = iommu_data;
>   struct vfio_iommu_group *group;
>   struct vfio_domain *domain, *d;
> - struct bus_type *bus = NULL;
> + struct vfio_device *iommu_api_dev;
>   bool resv_msi, msi_remap;
>   phys_addr_t resv_msi_base = 0;
>   struct iommu_domain_geometry *geo;
> @@ -2192,18 +2180,19 @@ static int vfio_iommu_type1_attach_g
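
Folding in both review comments above (the missing NULL check and the
missing vfio_group_put()), the helper would become roughly the
following sketch, not the code as posted:

struct vfio_device *vfio_device_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group = vfio_group_get_from_iommu(iommu_group);
	struct vfio_device *device;

	if (!group)
		return NULL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (vfio_device_try_get(device)) {
			mutex_unlock(&group->device_lock);
			/* the device reference implies a group reference */
			vfio_group_put(group);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	vfio_group_put(group);
	return NULL;
}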

Re: [PATCH v2 2/5] vfio/iommu_type1: Prefer to reuse domains vs match enforced cache coherency

2022-06-21 Thread Alex Williamson
On Wed, 15 Jun 2022 17:03:01 -0700
Nicolin Chen  wrote:

> From: Jason Gunthorpe 
> 
> The KVM mechanism for controlling wbinvd is based on OR of the coherency
> property of all devices attached to a guest, no matter those devices are
> attached to a single domain or multiple domains.
> 
> So, there is no value in trying to push a device that could do enforced
> cache coherency to a dedicated domain vs re-using an existing domain
> which is non-coherent since KVM won't be able to take advantage of it.
> This just wastes domain memory.
> 
> Simplify this code and eliminate the test. This removes the only logic
> that needed to have a dummy domain attached prior to searching for a
> matching domain and simplifies the next patches.
> 
> It's unclear whether we want to further optimize the Intel driver to
> update the domain coherency after a device is detached from it, at
> least not before KVM can be verified to handle such dynamics in related
> emulation paths (wbinvd, vcpu load, write_cr0, ept, etc.). In reality
> we don't see a usage requiring such optimization, as the only device
> which imposes such non-coherency is the Intel GPU, which doesn't even
> support hotplug/hot-remove.

The 2nd paragraph above is quite misleading in this respect.  I think
it would be more accurate to explain that the benefit to using separate
domains was that devices attached to domains supporting enforced cache
coherency always mapped with the attributes necessary to provide that
feature, therefore if a non-enforced domain was dropped, the associated
group removal would re-trigger an evaluation by KVM.  We can then go on
to discuss that in practice the only known cases of such mixed domains
included an Intel IGD device behind an IOMMU lacking snoop control,
where such devices do not support hotplug, therefore this scenario lacks
testing and is not considered sufficiently relevant to support.  Thanks,

Alex

> 
> Signed-off-by: Jason Gunthorpe 
> Signed-off-by: Nicolin Chen 
> ---
>  drivers/vfio/vfio_iommu_type1.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index c13b9290e357..f4e3b423a453 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -2285,9 +2285,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
>* testing if they're on the same bus_type.
>*/
>   list_for_each_entry(d, &iommu->domain_list, next) {
> - if (d->domain->ops == domain->domain->ops &&
> - d->enforce_cache_coherency ==
> - domain->enforce_cache_coherency) {
> + if (d->domain->ops == domain->domain->ops) {
>   iommu_detach_group(domain->domain, group->iommu_group);
>   if (!iommu_attach_group(d->domain,
>   group->iommu_group)) {



Re: [PATCH] vfio: Do not manipulate iommu dma_owner for fake iommu groups

2022-05-24 Thread Alex Williamson
On Thu, 19 May 2022 14:03:48 -0300
Jason Gunthorpe  wrote:

> Since asserting dma ownership now causes the group to have its DMA blocked
> the iommu layer requires a working iommu. This means the dma_owner APIs
> cannot be used on the fake groups that VFIO creates. Test for this and
> avoid calling them.
> 
> Otherwise asserting dma ownership will fail for VFIO mdev devices as a
> BLOCKING iommu_domain cannot be allocated due to the NULL iommu ops.
> 
> Fixes: 0286300e6045 ("iommu: iommu_group_claim_dma_owner() must always assign a domain")
> Reported-by: Eric Farman 
> Tested-by: Eric Farman 
> Signed-off-by: Jason Gunthorpe 
> ---
>  drivers/vfio/vfio.c | 15 ++-
>  1 file changed, 10 insertions(+), 5 deletions(-)

Applied to vfio next branch for v5.19.  Thanks,

Alex



Re: [PATCH v3] iommu: iommu_group_claim_dma_owner() must always assign a domain

2022-05-19 Thread Alex Williamson
On Thu, 19 May 2022 09:32:05 +0200
Joerg Roedel  wrote:

> On Wed, May 18, 2022 at 04:14:46PM -0300, Jason Gunthorpe wrote:
> > Fixes: 0286300e6045 ("iommu: iommu_group_claim_dma_owner() must always assign a domain")
> > Reported-by: Eric Farman 
> > Signed-off-by: Jason Gunthorpe 
> > ---
> >  drivers/vfio/vfio.c | 15 ++-
> >  1 file changed, 10 insertions(+), 5 deletions(-)  
> 
> That fix will be taken care of by Alex? Or does it need to go through
> the IOMMU tree?

The comment suggested my tree.  Jason, were you planning to post this
on its own instead of buried in a reply or shall we take it from here?
Thanks,

Alex



Re: [PATCH] vfio: Remove VFIO_TYPE1_NESTING_IOMMU

2022-05-17 Thread Alex Williamson
On Tue, 10 May 2022 13:55:24 -0300
Jason Gunthorpe  wrote:

> This control causes the ARM SMMU drivers to choose a stage 2
> implementation for the IO pagetable (vs the stage 1 usual default),
> however this choice has no visible impact to the VFIO user. Further qemu
> never implemented this and no other userspace user is known.
> 
> The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add
> new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide
> SMMU translation services to the guest operating system" however the rest
> of the API to set the guest table pointer for the stage 1 was never
> completed, or at least never upstreamed, rendering this part useless dead
> code.
> 
> Since the current patches to enable nested translation, aka userspace page
> tables, rely on iommufd and will not use the enable_nesting()
> iommu_domain_op, remove this infrastructure. However, don't cut too deep
> into the SMMU drivers for now expecting the iommufd work to pick it up -
> we still need to create S2 IO page tables.
> 
> Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the
> enable_nesting iommu_domain_op.
> 
> Just in case there is some userspace using this, continue to treat
> requesting it as a NOP, but do not advertise support any more.
> 
> Signed-off-by: Jason Gunthorpe 
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 16 
>  drivers/iommu/arm/arm-smmu/arm-smmu.c   | 16 
>  drivers/iommu/iommu.c   | 10 --
>  drivers/vfio/vfio_iommu_type1.c | 12 +---
>  include/linux/iommu.h   |  3 ---
>  include/uapi/linux/vfio.h   |  2 +-
>  6 files changed, 2 insertions(+), 57 deletions(-)
> 
> It would probably make sense for this to go through the VFIO tree with Robin's
> ack for the SMMU changes.

I'd be in favor of applying this, but it seems Robin and Eric are
looking for a stay of execution and I'd also be looking for an ack from
Joerg.  Thanks,

Alex



Re: [RESEND PATCH v8 00/11] Fix BUG_ON in vfio_iommu_group_notifier()

2022-05-13 Thread Alex Williamson
On Fri, 13 May 2022 17:49:44 +0200
Joerg Roedel  wrote:

> Hi Alex,
> 
> On Wed, May 04, 2022 at 10:29:56AM -0600, Alex Williamson wrote:
> > Done, and thanks for the heads-up.  Please try to cc me when the
> > vfio-notifier-fix branch is merged back into your next branch.  Thanks,  
> 
> This has happened now, the vfio-notifier-fix branch got the fix and is
> merged back into my next branch.

Thanks, Joerg!

Jason, I'll push a merge of this with

Subject: [PATCH] vfio: Delete container_q
0-v1-a1e8791d795b+6b-vfio_container_q_...@nvidia.com

and

Subject: [PATCH v3 0/8] Remove vfio_group from the struct file facing VFIO API
0-v3-f7729924a7ea+25e33-vfio_kvm_no_group_...@nvidia.com

as soon as my sanity build finishes.  Thanks,

Alex



Re: [PATCH] vfio: Stop using iommu_present()

2022-05-12 Thread Alex Williamson
On Tue,  5 Apr 2022 13:11:54 +0100
Robin Murphy  wrote:

> IOMMU groups have been mandatory for some time now, so a device without
> one is necessarily a device without any usable IOMMU, therefore the
> iommu_present() check is redundant (or at best unhelpful).
> 
> Signed-off-by: Robin Murphy 
> ---
>  drivers/vfio/vfio.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index a4555014bd1e..7b0a7b85e77e 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -745,11 +745,11 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
>  
>   iommu_group = iommu_group_get(dev);
>  #ifdef CONFIG_VFIO_NOIOMMU
> - if (!iommu_group && noiommu && !iommu_present(dev->bus)) {
> + if (!iommu_group && noiommu) {
>   /*
>* With noiommu enabled, create an IOMMU group for devices that
> -  * don't already have one and don't have an iommu_ops on their
> -  * bus.  Taint the kernel because we're about to give a DMA
> +  * don't already have one, implying no IOMMU hardware/driver
> +  * exists.  Taint the kernel because we're about to give a DMA
>* capable device to a user without IOMMU protection.
>*/
>   group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);

Applied to vfio next branch for v5.19.  Thanks,

Alex



Re: [RESEND PATCH v8 00/11] Fix BUG_ON in vfio_iommu_group_notifier()

2022-05-04 Thread Alex Williamson
On Wed, 4 May 2022 10:42:07 +0200
Joerg Roedel  wrote:

> On Mon, May 02, 2022 at 12:12:04PM -0400, Qian Cai wrote:
> > Reverting this series fixed a use-after-free while doing SR-IOV.
> > 
> >  BUG: KASAN: use-after-free in __lock_acquire  
> 
> Hrm, okay. I am going exclude this series from my next branch for now
> until this has been sorted out.
> 
> Alex, I suggest you do the same.

Done, and thanks for the heads-up.  Please try to cc me when the
vfio-notifier-fix branch is merged back into your next branch.  Thanks,

Alex



Re: [PATCH RFC 00/19] IOMMUFD Dirty Tracking

2022-05-02 Thread Alex Williamson
On Fri, 29 Apr 2022 05:45:20 +
"Tian, Kevin"  wrote:
> > From: Joao Martins 
> >  3) Unmapping an IOVA range while returning its dirty bit prior to
> > > unmap. This case is specific to the non-nested vIOMMU case, where an
> > > erroneous guest (or device) is DMAing to an address being unmapped at
> > > the same time.
> 
> an erroneous attempt like above cannot anticipate which DMAs can
> succeed in that window thus the end behavior is undefined. For an
> undefined behavior nothing will be broken by losing some bits dirtied
> in the window between reading back dirty bits of the range and
> actually calling unmap. From guest p.o.v. all those are black-box
> hardware logic to serve a virtual iotlb invalidation request which just
> cannot be completed in one cycle.
> 
> Hence in reality probably this is not required except to meet vfio
> compat requirement. Just in concept returning dirty bits at unmap
> is more accurate.
> 
> I'm slightly inclined to abandon it in iommufd uAPI.

Sorry, I'm not following why an unmap with returned dirty bitmap
operation is specific to a vIOMMU case, or in fact indicative of some
sort of erroneous, racy behavior of guest or device.  We need the
flexibility to support memory hot-unplug operations during migration,
but even in the vIOMMU case, isn't it fair for the VMM to ask whether a
device dirtied the range being unmapped?  This was implemented as a
single operation specifically to avoid races where ongoing access may be
available after retrieving a snapshot of the bitmap.  Thanks,

Alex
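
The combined operation Alex describes is the existing type1 uAPI; from
userspace it looks roughly like this (field layout per <linux/vfio.h>,
4K pages assumed, error handling omitted):

#include <linux/vfio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int unmap_and_get_dirty(int container_fd, __u64 iova, __u64 size,
			       __u64 *dirty_bits /* caller-allocated */)
{
	size_t argsz = sizeof(struct vfio_iommu_type1_dma_unmap) +
		       sizeof(struct vfio_bitmap);
	struct vfio_iommu_type1_dma_unmap *unmap = calloc(1, argsz);
	struct vfio_bitmap *bitmap = (struct vfio_bitmap *)unmap->data;
	__u64 npages = size / 4096;
	int ret;

	unmap->argsz = argsz;
	unmap->flags = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
	unmap->iova = iova;
	unmap->size = size;
	bitmap->pgsize = 4096;
	bitmap->size = ((npages + 63) / 64) * 8;	/* bytes, u64-aligned */
	bitmap->data = dirty_bits;
	/* the kernel unmaps and reports dirty state in a single step */
	ret = ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, unmap);
	free(unmap);
	return ret;
}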



Re: [PATCH v8 00/11] Fix BUG_ON in vfio_iommu_group_notifier()

2022-04-08 Thread Alex Williamson
On Fri, 8 Apr 2022 10:59:22 -0500
Bjorn Helgaas  wrote:

> On Fri, Apr 08, 2022 at 05:37:16PM +0200, Joerg Roedel wrote:
> > On Fri, Apr 08, 2022 at 11:17:47AM -0300, Jason Gunthorpe wrote:  
> > > You might consider using a linear tree instead of the topic branches,
> > > topics are tricky and I'm not sure it helps a small subsystem so much.
> > > Conflicts between topics are a PITA for everyone, and it makes
> > > handling conflicts with rc much harder than it needs to be.  
> > 
> > I like the concept of a branch per driver, because with that I can just
> > exclude that branch from my next-merge when there are issues with it.
> > Conflicts between branches happen too, but they are quite manageable
> > when the branches have the same base.  
> 
> FWIW, I use the same topic branch approach for PCI.  I like the
> ability to squash in fixes or drop things without having to clutter
> the history with trivial commits and reverts.  I haven't found
> conflicts to be a problem.

Same.  I think I've generally modeled my branch handling after Bjorn
and Joerg, I don't always use topic branches, but will for larger
contributions and I don't generally find conflicts to be a problem.
I'm always open to adopting best practices though.  Thanks,

Alex



Re: [PATCH v2 4/4] vfio: Require that devices support DMA cache coherence

2022-04-08 Thread Alex Williamson
On Thu,  7 Apr 2022 12:23:47 -0300
Jason Gunthorpe  wrote:

> IOMMU_CACHE means that normal DMAs do not require any additional coherency
> mechanism and is the basic uAPI that VFIO exposes to userspace. For
> instance VFIO applications like DPDK will not work if additional coherency
> operations are required.
> 
> Therefore check IOMMU_CAP_CACHE_COHERENCY like vdpa & usnic do before
> allowing an IOMMU backed VFIO device to be created.
> 
> Signed-off-by: Jason Gunthorpe 
> ---
>  drivers/vfio/vfio.c | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index a4555014bd1e72..9edad767cfdad3 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -815,6 +815,13 @@ static int __vfio_register_dev(struct vfio_device *device,
>  
>  int vfio_register_group_dev(struct vfio_device *device)
>  {
> + /*
> +  * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
> +  * restore cache coherency.
> +  */
> + if (!iommu_capable(device->dev->bus, IOMMU_CAP_CACHE_COHERENCY))
> + return -EINVAL;
> +
>   return __vfio_register_dev(device,
>   vfio_group_find_or_alloc(device->dev));
>  }

Acked-by: Alex Williamson 



Re: [PATCH v2 2/4] vfio: Move the Intel no-snoop control off of IOMMU_CACHE

2022-04-08 Thread Alex Williamson
On Thu,  7 Apr 2022 12:23:45 -0300
Jason Gunthorpe  wrote:

> IOMMU_CACHE means "normal DMA to this iommu_domain's IOVA should be cache
> coherent" and is used by the DMA API. The definition allows for special
> non-coherent DMA to exist - ie processing of the no-snoop flag in PCIe
> TLPs - so long as this behavior is opt-in by the device driver.
> 
> The flag is mainly used by the DMA API to synchronize the IOMMU setting
> with the expected cache behavior of the DMA master, e.g. based on
> dev_is_dma_coherent() in some cases.
> 
> For Intel IOMMU IOMMU_CACHE was redefined to mean 'force all DMA to be
> cache coherent' which has the practical effect of causing the IOMMU to
> ignore the no-snoop bit in a PCIe TLP.
> 
> x86 platforms are always IOMMU_CACHE, so Intel should ignore this flag.
> 
> Instead use the new domain op enforce_cache_coherency() which causes every
> IOPTE created in the domain to have the no-snoop blocking behavior.
> 
> Reconfigure VFIO to always use IOMMU_CACHE and call
> enforce_cache_coherency() to operate the special Intel behavior.
> 
> Remove the IOMMU_CACHE test from Intel IOMMU.
> 
> Ultimately VFIO plumbs the result of enforce_cache_coherency() back into
> the x86 platform code through kvm_arch_register_noncoherent_dma() which
> controls if the WBINVD instruction is available in the guest. No other
> arch implements kvm_arch_register_noncoherent_dma().

I think this last sentence is alluding to it, but I wish the user
visible change to VFIO_DMA_CC_IOMMU on non-x86 were more explicit.
Perhaps for the last sentence:

  No other archs implement kvm_arch_register_noncoherent_dma() nor are
  there any other known consumers of VFIO_DMA_CC_IOMMU that might be
  affected by the user visible result change on non-x86 archs.

Otherwise,

Acked-by: Alex Williamson 

> 
> Signed-off-by: Jason Gunthorpe 
> ---
>  drivers/iommu/intel/iommu.c |  7 ++-
>  drivers/vfio/vfio_iommu_type1.c | 30 +++---
>  include/linux/intel-iommu.h |  1 -
>  3 files changed, 21 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index f08611a6cc4799..8f3674e997df06 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -641,7 +641,6 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
>  static void domain_update_iommu_cap(struct dmar_domain *domain)
>  {
>   domain_update_iommu_coherency(domain);
> - domain->iommu_snooping = domain_update_iommu_snooping(NULL);
>   domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
>  
>   /*
> @@ -4283,7 +4282,6 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width)
>   domain->agaw = width_to_agaw(adjust_width);
>  
>   domain->iommu_coherency = false;
> - domain->iommu_snooping = false;
>   domain->iommu_superpage = 0;
>   domain->max_addr = 0;
>  
> @@ -4422,8 +4420,7 @@ static int intel_iommu_map(struct iommu_domain *domain,
>   prot |= DMA_PTE_READ;
>   if (iommu_prot & IOMMU_WRITE)
>   prot |= DMA_PTE_WRITE;
> - if (((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) ||
> - dmar_domain->enforce_no_snoop)
> + if (dmar_domain->enforce_no_snoop)
>   prot |= DMA_PTE_SNP;
>  
>   max_addr = iova + size;
> @@ -4550,7 +4547,7 @@ static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
>  {
>   struct dmar_domain *dmar_domain = to_dmar_domain(domain);
>  
> - if (!dmar_domain->iommu_snooping)
> + if (!domain_update_iommu_snooping(NULL))
>   return false;
>   dmar_domain->enforce_no_snoop = true;
>   return true;
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 9394aa9444c10c..c13b9290e35759 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -84,8 +84,8 @@ struct vfio_domain {
>   struct iommu_domain *domain;
>   struct list_head next;
>   struct list_head group_list;
> - int prot;   /* IOMMU_CACHE */
> - bool fgsp;   /* Fine-grained super pages */
> + bool fgsp : 1;   /* Fine-grained super pages */
> + bool enforce_cache_coherency : 1;
>  };
>  
>  struct vfio_dma {
> @@ -1461,7 +1461,7 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
>  
>   list_for_each_entry(d, &iommu->domain_list, next

Re: [PATCH 1/5] iommu: Replace uses of IOMMU_CAP_CACHE_COHERENCY with dev_is_dma_coherent()

2022-04-07 Thread Alex Williamson
On Thu, 7 Apr 2022 12:23:31 -0300
Jason Gunthorpe  wrote:

> On Thu, Apr 07, 2022 at 04:17:11PM +0100, Robin Murphy wrote:
> 
> > For the specific case of overriding PCIe No Snoop (which is more problematic
> > from an Arm SMMU PoV) when assigning to a VM, would that not be easier
> > solved by just having vfio-pci clear the "Enable No Snoop" control bit in
> > the endpoint's PCIe capability?  
> 
> Ideally.
> 
> That was rediscussed recently, apparently there are non-compliant
> devices and drivers that just ignore the bit. 
> 
> Presumably this is why x86 had to move to an IOMMU enforced feature..

I considered this option when implementing the current solution, but
ultimately I didn't have confidence in being able to prevent drivers
from using device specific means to effect the change anyway.  GPUs
especially have various back channels to config space.  Thanks,

Alex



Re: [PATCH 3/5] iommu: Introduce the domain op enforce_cache_coherency()

2022-04-05 Thread Alex Williamson
On Tue,  5 Apr 2022 13:16:02 -0300
Jason Gunthorpe  wrote:

> This new mechanism will replace using IOMMU_CAP_CACHE_COHERENCY and
> IOMMU_CACHE to control the no-snoop blocking behavior of the IOMMU.
> 
> Currently only Intel and AMD IOMMUs are known to support this
> feature. They both implement it as an IOPTE bit, that when set, will cause
> PCIe TLPs to that IOVA with the no-snoop bit set to be treated as though
> the no-snoop bit was clear.
> 
> The new API is triggered by calling enforce_cache_coherency() before
> mapping any IOVA to the domain which globally switches on no-snoop
> blocking. This allows other implementations that might block no-snoop
> globally and outside the IOPTE - AMD also documents such an HW capability.
> 
> Leave AMD out of sync with Intel and have it block no-snoop even for
> in-kernel users. This can be trivially resolved in a follow up patch.
> 
> Only VFIO will call this new API.
> 
> Signed-off-by: Jason Gunthorpe 
> ---
>  drivers/iommu/amd/iommu.c   |  7 +++
>  drivers/iommu/intel/iommu.c | 14 +-
>  include/linux/intel-iommu.h |  1 +
>  include/linux/iommu.h   |  4 
>  4 files changed, 25 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
> index a1ada7bff44e61..e500b487eb3429 100644
> --- a/drivers/iommu/amd/iommu.c
> +++ b/drivers/iommu/amd/iommu.c
> @@ -2271,6 +2271,12 @@ static int amd_iommu_def_domain_type(struct device *dev)
>   return 0;
>  }
>  
> +static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
> +{
> + /* IOMMU_PTE_FC is always set */
> + return true;
> +}
> +
>  const struct iommu_ops amd_iommu_ops = {
>   .capable = amd_iommu_capable,
>   .domain_alloc = amd_iommu_domain_alloc,
> @@ -2293,6 +2299,7 @@ const struct iommu_ops amd_iommu_ops = {
>   .flush_iotlb_all = amd_iommu_flush_iotlb_all,
>   .iotlb_sync = amd_iommu_iotlb_sync,
>   .free   = amd_iommu_domain_free,
> + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
>   }
>  };
>  
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index df5c62ecf942b8..f08611a6cc4799 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -4422,7 +4422,8 @@ static int intel_iommu_map(struct iommu_domain *domain,
>   prot |= DMA_PTE_READ;
>   if (iommu_prot & IOMMU_WRITE)
>   prot |= DMA_PTE_WRITE;
> - if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
> + if (((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) ||
> + dmar_domain->enforce_no_snoop)
>   prot |= DMA_PTE_SNP;
>  
>   max_addr = iova + size;
> @@ -4545,6 +4546,16 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
>   return phys;
>  }
>  
> +static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
> +{
> + struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> +
> + if (!dmar_domain->iommu_snooping)
> + return false;
> + dmar_domain->enforce_no_snoop = true;
> + return true;
> +}

Don't we have issues if we try to set DMA_PTE_SNP on DMARs that don't
support it, ie. reserved register bit set in pte faults?  It seems
really inconsistent here that I could make a domain that supports
iommu_snooping, set enforce_no_snoop = true, then add another DMAR to
the domain that may not support iommu_snooping, I'd get false on the
subsequent enforcement test, but the dmar_domain is still trying to use
DMA_PTE_SNP.

There's also a disconnect, maybe just in the naming or documentation,
but if I call enforce_cache_coherency for a domain, that seems like the
domain should retain those semantics regardless of how it's modified,
ie. "enforced".  For example, if I tried to perform the above operation,
I should get a failure attaching the device that brings in the less
capable DMAR because the domain has been set to enforce this feature.

If the API is that I need to re-enforce_cache_coherency on every
modification of the domain, shouldn't dmar_domain->enforce_no_snoop
also return to a default value on domain changes?

Maybe this should be something like set_no_snoop_squashing with the
above semantics, it needs to be re-applied whenever the domain:device
composition changes?  Thanks,

Alex

> +
>  static bool intel_iommu_capable(enum iommu_cap cap)
>  {
>   if (cap == IOMMU_CAP_CACHE_COHERENCY)
> @@ -4898,6 +4909,7 @@ const struct iommu_ops intel_iommu_ops = {
>   .iotlb_sync = intel_iommu_tlb_sync,
>   .iova_to_phys   = intel_iommu_iova_to_phys,
>   .free   = intel_iommu_domain_free,
> + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
>   }
>  };
>  
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index 2f9891cb3d0014..1f930c0c225d94 100644
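
For context on the intended call pattern: the type1 consumer elsewhere
in this series (simplified sketch here) invokes the op once, right
after allocating the domain and before any IOVA is mapped, so every
IOPTE picks up the no-snoop-blocking behavior:

	if (domain->domain->ops->enforce_cache_coherency)
		domain->enforce_cache_coherency =
			domain->domain->ops->enforce_cache_coherency(
					domain->domain);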

Re: [PATCH 2/5] vfio: Require that devices support DMA cache coherence

2022-04-05 Thread Alex Williamson
On Tue,  5 Apr 2022 13:16:01 -0300
Jason Gunthorpe  wrote:

> dev_is_dma_coherent() is the control to determine if IOMMU_CACHE can be
> supported.
> 
> IOMMU_CACHE means that normal DMAs do not require any additional coherency
> mechanism and is the basic uAPI that VFIO exposes to userspace. For
> instance VFIO applications like DPDK will not work if additional coherency
> operations are required.
> 
> Therefore check dev_is_dma_coherent() before allowing a device to join a
> domain. This will block device/platform/iommu combinations from using VFIO
> that do not support cache coherent DMA.
> 
> Signed-off-by: Jason Gunthorpe 
> ---
>  drivers/vfio/vfio.c | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index a4555014bd1e72..2a3aa3e742d943 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -32,6 +32,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "vfio.h"
>  
>  #define DRIVER_VERSION   "0.3"
> @@ -1348,6 +1349,11 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
>   if (IS_ERR(device))
>   return PTR_ERR(device);
>  
> + if (group->type == VFIO_IOMMU && !dev_is_dma_coherent(device->dev)) {
> + ret = -ENODEV;
> + goto err_device_put;
> + }
> +

Failing at the point where the user is trying to gain access to the
device seems a little late in the process and opaque, wouldn't we
rather have vfio bus drivers fail to probe such devices?  I'd expect
this to occur in the vfio_register_group_dev() path.  Thanks,

Alex

>   if (!try_module_get(device->dev->driver->owner)) {
>   ret = -ENODEV;
>   goto err_device_put;



Re: [PATCH RFC v2 02/11] iommu: Add iommu_group_singleton_lockdown()

2022-03-30 Thread Alex Williamson
On Wed, 30 Mar 2022 14:18:47 +
"Tian, Kevin"  wrote:

> +Alex
> 
> > From: Tian, Kevin
> > Sent: Wednesday, March 30, 2022 10:13 PM
> >   
> > > From: Jason Gunthorpe
> > > Sent: Wednesday, March 30, 2022 7:58 PM
> > >
> > > On Wed, Mar 30, 2022 at 06:50:11AM +, Tian, Kevin wrote:
> > >  
> > > > One thing that I'm not very sure is about DMA alias. Even when 
> > > > physically
> > > > there is only a single device within the group the aliasing could lead
> > > > to multiple RIDs in the group making it non-singleton. But probably we
> > > > don't need support SVA on such device until a real demand comes?  
> > >
> > > How can we have multiple RIDs in the same group and have only one
> > > device in the group?  
> > 
> > Alex may help throw some insight here. Per what I read from the code,
> > it looks like certain devices can generate traffic with multiple RIDs.

You only need to look so far as the dma alias quirks to find devices
that use the wrong RID for DMA.  In general I don't think we have
enough confidence to say that for all these devices the wrong RID is
exclusively used versus some combination of both RIDs.  Also, the
aliased RID is not always a physical device.  Thanks,

Alex



Re: Bug report: VFIO map/unmap mem subject to race and DMA data goes to incorrect page (4.18.0)

2022-03-28 Thread Alex Williamson
On Mon, 28 Mar 2022 12:14:51 -0700
"Daniel F. Smith"  wrote:

> Hi Alex,
> 
> Answers to questions I can answer are in-line.  First an apology
> though---the machine with the FPGA board is 1000 miles remote, and I don't
> have root access.  It's unlikely I will be able to do kernel patch testing.
> 
> 
> Alex Williamson scribed the following, on or around Fri, Mar 25, 2022 at 
> 04:10:22PM -0600:
> > Hi Daniel,
> >   
> ...
> >
> > Coherency possibly.
> > 
> > There's a possible coherency issue at the compare depending on the
> > IOMMU capabilities which could affect whether DMA is coherent to memory
> > or requires an explicit flush.  I'm a little suspicious whether dmar0
> > is really the IOMMU controlling this device since you mention a 39bit
> > IOVA space, which is more typical of Intel client platforms which can
> > also have integrated graphics which often have a dedicated IOMMU at
> > dmar0 that isn't necessarily representative of the other IOMMUs in the
> > system, especially with regard to snoop-control.  Each dmar lists the
> > managed devices under it in sysfs to verify.  Support for snoop-control
> > would be identified in the ecap register rather than the cap register.
> > VFIO can also report coherency via the VFIO_DMA_CC_IOMMU extension
> > reported by VFIO_CHECK_EXTENSION ioctl.  
> 
> $ cat /sys/devices/virtual/iommu/dmar0/intel-iommu/cap
> d2008c40660462
> $ cat /sys/devices/virtual/iommu/dmar0/intel-iommu/ecap
> f050da
> $ lscpu | grep Model
> Model:   165
> Model name:  Intel(R) Xeon(R) W-1290P CPU @ 3.70GHz
> $ ls -l /sys/devices/virtual/iommu/dmar0/devices | wc -l
> 24
> $ ... ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_DMA_CC_IOMMU)
> 0

Your ecap register reports bit 7 (Snoop Control) set, which should mean
that VT-d is enforcing coherency regardless of no-snoop transactions.
I suspect maybe the different result from the ioctl could be from
testing this extension before the IOMMU has been set for the
container(?)
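In other words the ordering matters; a sketch of the sequence where the
answer is meaningful (the group number is hypothetical):

    int container = open("/dev/vfio/vfio", O_RDWR);
    int group = open("/dev/vfio/26", O_RDWR);

    ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
    ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);

    /* Only now does this reflect the attached IOMMU backend */
    int cc = ioctl(container, VFIO_CHECK_EXTENSION, VFIO_DMA_CC_IOMMU);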

> What are the implications of having no "IOMMU enforces DMA cache
> conherence"?  On this machine there is no access to a PCIe bus analyzer, but
> it's very unlikely that the TLPs would have NoSnoop set.

There's also bit 11 (Enable No Snoop) that could be cleared in the PCI
device control register, which would theoretically prevent the device
from using no-snoop TLPs.
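For illustration, a driver that owns the device can clear that bit with
the existing PCIe capability helpers (sketch; bit 11 is
PCI_EXP_DEVCTL_NOSNOOP_EN):

    /* Prevent the device from issuing no-snoop TLPs */
    pcie_capability_clear_word(pdev, PCI_EXP_DEVCTL,
                               PCI_EXP_DEVCTL_NOSNOOP_EN);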
 
> Is there a good way to tell which IOMMU I'm using?

Which DMAR?  Like this for example:

$ readlink -f /sys/bus/pci/devices/:04:00.0/iommu
/sys/devices/virtual/iommu/dmar3

Your listing of devices piped to wc would also reciprocally list the
device in that output.  With 24 devices there's a fair chance that
dmar0 is the only one used.

> (I did think it was strange that the IOMMU in this machine cannot handle
> enough bits for mapping IOVA==VMA.  The test code is running in a podman
> container, but (naively) I wouldn't expect that to make a difference.)

Single socket Xeon processors like this tend to share more similarities
to consumer desktop processors than to the "Scalable" line of Xeons.

FWIW, there's a proposal[1] for a new, shared userspace IOMMU interface
that includes an option for the kernel to allocate IOVAs for these
cases.

[1]https://lore.kernel.org/all/0-v1-e79cd8d168e8+6-iommufd_...@nvidia.com/

> > However, CPU coherency might lead to a miscompare, but not necessarily a
> > miscompare matching the previous iteration.  Still, for completeness
> > let's make sure this isn't a gap in the test programming making invalid
> > assumptions about CPU/DMA coherency.
> > 
> > The fact that randomizing the IOVA provides a workaround though might
> > suggest something relative to the IOMMU page table coherency.  But for
> > the new mmap target to have the data from the previous iteration, the
> > IOMMU PTE would need to be stale on read, but correct on write in order
> > to land back in your new mmap.  That seems peculiar.  Are we sure the
> > FPGA device isn't caching the value at the IOVA or using any sort of
> > IOTLB caching such as ATS that might not be working correctly?  
> 
> I cannot say for certain what the FPGA caches, if anything.  The IP for that
> part is closed (search for Xilinx PG302 QDMA).  It should (!) be
> well-tested... oh for an analyzer!
> 
> > > Suggestion: Document issue when using fixed IOVA, or fix if security
> > > is a concern.  
> > 
> > I don't know that there's enough information here to make any
> > conclusions.  Here are some further questions:
> > 
> >  * What size mappings are being used, both for the mmap and the VFIO
> >    MAP/UNMAP operations.

Re: [PATCH RFC 08/12] iommufd: IOCTLs for the io_pagetable

2022-03-28 Thread Alex Williamson
On Mon, 28 Mar 2022 16:47:49 -0300
Jason Gunthorpe  wrote:

> On Mon, Mar 28, 2022 at 03:57:53PM -0300, Jason Gunthorpe wrote:
> 
> > So, currently AMD and Intel have exactly the same HW feature with a
> > different kAPI..  
> 
> I fixed it like below and made the ordering changes Kevin pointed
> to. Will send next week after the merge window:
> 
> 527e438a974a06 iommu: Delete IOMMU_CAP_CACHE_COHERENCY
> 5cbc8603ffdf20 vfio: Move the Intel no-snoop control off of IOMMU_CACHE
> ebc961f93d1af3 iommu: Introduce the domain op enforce_cache_coherency()
> 79c52a2bb1e60b vfio: Require that devices support DMA cache coherence
> 02168f961b6a75 iommu: Replace uses of IOMMU_CAP_CACHE_COHERENCY with 
> dev_is_dma_coherent()
> 
> '79c can be avoided, we'd just drive IOMMU_CACHE off of
> dev_is_dma_coherent() - but if we do that I'd like to properly
> document the arch/iommu/platform/kvm combination that is using this..

We can try to enforce dev_is_dma_coherent(), as you note it's not going
to affect any x86 users.  arm64 is the only obviously relevant arch that
defines ARCH_HAS_SYNC_DMA_FOR_{DEVICE,CPU} but the device.dma_coherent
setting comes from ACPI/OF firmware, so someone from ARM land will need
to shout if this is an issue.  I think we'd need to back off and go
with documentation if a broken use case shows up.  Thanks,

Alex
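For reference, the helper in question reduces to roughly the following
(paraphrased from include/linux/dma-map-ops.h, config guards trimmed),
which is why x86 is unaffected:

    #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
        defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU)
    static inline bool dev_is_dma_coherent(struct device *dev)
    {
            return dev->dma_coherent;  /* set from ACPI/OF firmware */
    }
    #else
    static inline bool dev_is_dma_coherent(struct device *dev)
    {
            return true;               /* e.g. x86: DMA always coherent */
    }
    #endif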

 
> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
> index 3c0ac3c34a7f9a..f144eb9fea8e31 100644
> --- a/drivers/iommu/amd/iommu.c
> +++ b/drivers/iommu/amd/iommu.c
> @@ -2269,6 +2269,12 @@ static int amd_iommu_def_domain_type(struct device 
> *dev)
>   return 0;
>  }
>  
> +static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
> +{
> + /* IOMMU_PTE_FC is always set */
> + return true;
> +}
> +
>  const struct iommu_ops amd_iommu_ops = {
>   .capable = amd_iommu_capable,
>   .domain_alloc = amd_iommu_domain_alloc,
> @@ -2291,6 +2297,7 @@ const struct iommu_ops amd_iommu_ops = {
>   .flush_iotlb_all = amd_iommu_flush_iotlb_all,
>   .iotlb_sync = amd_iommu_iotlb_sync,
>   .free   = amd_iommu_domain_free,
> + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
>   }
>  };
> 
> Thanks,
> Jason
> 



Re: [PATCH RFC 08/12] iommufd: IOCTLs for the io_pagetable

2022-03-28 Thread Alex Williamson
On Thu, 24 Mar 2022 10:46:22 -0300
Jason Gunthorpe  wrote:

> On Thu, Mar 24, 2022 at 07:25:03AM +, Tian, Kevin wrote:
> 
> > Based on that here is a quick tweak of the force-snoop part (not compiled). 
> >  
> 
> I liked your previous idea better, that IOMMU_CAP_CACHE_COHERENCY
> started out OK but got weird. So lets fix it back to the way it was.
> 
> How about this:
> 
> https://github.com/jgunthorpe/linux/commits/intel_no_snoop
> 
> b11c19a4b34c2a iommu: Move the Intel no-snoop control off of IOMMU_CACHE
> 5263947f9d5f36 vfio: Require that device support DMA cache coherence

I have some issues with the argument here:

  This will block device/platform/iommu combinations that do not
  support cache coherent DMA - but these never worked anyhow as VFIO
  did not expose any interface to perform the required cache
  maintenance operations.

VFIO never intended to provide such operations, it only tried to make
the coherence of the device visible to userspace such that it can
perform operations via other means, for example via KVM.  The "never
worked" statement here seems false.

Commit b11c19a4b34c2a also appears to be a behavioral change.  AIUI
vfio_domain.enforce_cache_coherency would only be set on Intel VT-d
where snoop-control is supported, this translates to KVM emulating
coherency instructions everywhere except VT-d w/ snoop-control.

My understanding of AMD-Vi is that no-snoop TLPs are always coherent, so
this would trigger unnecessary wbinvd emulation on those platforms.  I
don't know if other archs need similar, but it seems we're changing
polarity wrt no-snoop TLPs from "everyone is coherent except this case
on Intel" to "everyone is non-coherent except this opposite case on
Intel".  Thanks,

Alex

> eab4b381c64a30 iommu: Restore IOMMU_CAP_CACHE_COHERENCY to its original 
> meaning
> 2752e12bed48f6 iommu: Replace uses of IOMMU_CAP_CACHE_COHERENCY with 
> dev_is_dma_coherent()
> 
> If you like it could you take it from here?
> 
> Thanks,
> Jason
> 



Re: Bug report: VFIO map/unmap mem subject to race and DMA data goes to incorrect page (4.18.0)

2022-03-25 Thread Alex Williamson
Hi Daniel,

On Fri, 25 Mar 2022 13:06:40 -0700
"Daniel F. Smith"  wrote:

> This email is to document an insidious (incorrect data, no error or warning)
> VFIO bug found when using the Intel IOMMU to perform DMA transfers; and the
> associated workaround.
> 
> There may be security implications (unsure).
> 
> /sys/devices/virtual/iommu/dmar0/intel-iommu/version: 1:0
> /sys/devices/virtual/iommu/dmar0/intel-iommu/cap: d2008c40660462
> Linux x.ibm.com 4.18.0-348.20.1.el8_5.x86_64 #1 SMP Tue Mar 8 12:56:54 
> EST 2022 x86_64 x86_64 x86_64 GNU/Linux
> Red Hat Enterprise Linux release 8.5 (Ootpa)
> 
> In our testing of VFIO DMA to an FPGA card in rootless mode, we discovered a
> glitch where DMA data are transferred to/from the incorrect page.  It
> appears timing based.  Under some specific conditions the test could trigger
> the bug every loop.  Sometimes the bug would only emerge after 20+ minutes
> of testing.
> 
> Basics of test:
>   Get memory with mmap(anonymous): size can change.
>   VFIO_IOMMU_MAP_DMA with a block of memory, fixed IOVA.
>   Fill memory with pattern.
>   Do DMA transfer to FPGA from memory at IOVA.
>   Do DMA transfer from FPGA to memory at IOVA+offset.
>   Compare memory to ensure match.  Miscompare is bug.
>   VFIO_IOMMU_UNMAP_DMA 
>   unmap()
>   Repeat.
> 
> Using the fixed IOVA address* caused sporadic memory miscompares.  The
> nature of the miscompares is that the received data was mixed with pages
> that had been returned by mmap in a *previous* loop.
> 
> Workaround: Randomizing the IOVA eliminated the memory miscompares.
> 
> Hypothesis/conjecture: Possible race condition in UNMAP_DMA such that pages
> can be released/munlocked *after* the MAP_DMA with the same IOVA has
> occurred.

Coherency possibly.

There's a possible coherency issue at the compare depending on the
IOMMU capabilities which could affect whether DMA is coherent to memory
or requires an explicit flush.  I'm a little suspicious whether dmar0
is really the IOMMU controlling this device since you mention a 39bit
IOVA space, which is more typical of Intel client platforms which can
also have integrated graphics which often have a dedicated IOMMU at
dmar0 that isn't necessarily representative of the other IOMMUs in the
system, especially with regard to snoop-control.  Each dmar lists the
managed devices under it in sysfs to verify.  Support for snoop-control
would be identified in the ecap register rather than the cap register.
VFIO can also report coherency via the VFIO_DMA_CC_IOMMU extension
reported by VFIO_CHECK_EXTENSION ioctl.

However, CPU coherency might lead to a miscompare, but not necessarily a
miscompare matching the previous iteration.  Still, for completeness
let's make sure this isn't a gap in the test programming making invalid
assumptions about CPU/DMA coherency.

The fact that randomizing the IOVA provides a workaround though might
suggest something relative to the IOMMU page table coherency.  But for
the new mmap target to have the data from the previous iteration, the
IOMMU PTE would need to be stale on read, but correct on write in order
to land back in your new mmap.  That seems peculiar.  Are we sure the
FPGA device isn't caching the value at the IOVA or using any sort of
IOTLB caching such as ATS that might not be working correctly?

> Suggestion: Document issue when using fixed IOVA, or fix if security
> is a concern.

I don't know that there's enough information here to make any
conclusions.  Here are some further questions:

 * What size mappings are being used, both for the mmap and the VFIO
   MAP/UNMAP operations.

 * If the above is venturing into super page support (2MB), does the
   vfio_iommu_type1 module option disable_hugepages=1 affect the
   results.

 * Along the same lines, does the kernel command line option
   intel_iommu=sp_off produce different results.

 * Does this behavior also occur on upstream kernels (ie. v5.17)?

 * Do additional CPU cache flushes in the test program produce different
   results?

 * Is this a consumer available FPGA device that others might be able
   to reproduce this issue?  I've always wanted such a device for
   testing, but also we can't rule out that the FPGA itself or its
   programming is the source of the miscompare.

From the vfio perspective, UNMAP_DMA should first unmap the pages at
the IOMMU to prevent device access before unpinning the pages.  We do
make use of batch unmapping to reduce iotlb flushing, but the result is
expected to be that the IOMMU PTE entries are invalidated before the
UNMAP_DMA operation completes.  A stale IOVA would not be expected or
correct operation.  Thanks,

Alex
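Schematically, the expected ordering is (a simplified sketch, not the
literal type1 code, which batches and defers the IOTLB flushes):

    /* 1. Invalidate the IOMMU PTEs (and IOTLB) so the device can no
     *    longer reach the pages...
     */
    iommu_unmap(domain->domain, iova, size);

    /* 2. ...and only then unpin, allowing the pages to be reused */
    unpin_user_pages_dirty_lock(pages, npages, true);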



Re: [PATCH RFC 04/12] kernel/user: Allow user::locked_vm to be usable for iommufd

2022-03-24 Thread Alex Williamson
On Thu, 24 Mar 2022 19:27:39 -0300
Jason Gunthorpe  wrote:

> On Thu, Mar 24, 2022 at 02:40:15PM -0600, Alex Williamson wrote:
> > On Tue, 22 Mar 2022 13:15:21 -0300
> > Jason Gunthorpe via iommu  wrote:
> >   
> > > On Tue, Mar 22, 2022 at 09:29:23AM -0600, Alex Williamson wrote:
> > >   
> > > > I'm still picking my way through the series, but the later compat
> > > > interface doesn't mention this difference as an outstanding issue.
> > > > Doesn't this difference need to be accounted in how libvirt manages VM
> > > > resource limits?  
> > > 
> > > AFACIT, no, but it should be checked.
> > >   
> > > > AIUI libvirt uses some form of prlimit(2) to set process locked
> > > > memory limits.
> > > 
> > > Yes, and ulimit does work fully. prlimit adjusts the value:
> > > 
> > > int do_prlimit(struct task_struct *tsk, unsigned int resource,
> > >   struct rlimit *new_rlim, struct rlimit *old_rlim)
> > > {
> > >   rlim = tsk->signal->rlim + resource;
> > > [..]
> > >   if (new_rlim)
> > >   *rlim = *new_rlim;
> > > 
> > > Which vfio reads back here:
> > > 
> > > drivers/vfio/vfio_iommu_type1.c:unsigned long pfn, limit = 
> > > rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > drivers/vfio/vfio_iommu_type1.c:unsigned long limit = 
> > > rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > 
> > > And iommufd does the same read back:
> > > 
> > >   lock_limit =
> > >   task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > >   npages = pages->npinned - pages->last_npinned;
> > >   do {
> > >   cur_pages = atomic_long_read(&pages->source_user->locked_vm);
> > >   new_pages = cur_pages + npages;
> > >   if (new_pages > lock_limit)
> > >   return -ENOMEM;
> > >   } while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages,
> > >new_pages) != cur_pages);
> > > 
> > > So it does work essentially the same.  
> > 
> > Well, except for the part about vfio updating mm->locked_vm and iommufd
> > updating user->locked_vm, a per-process counter versus a per-user
> > counter.  prlimit specifically sets process resource limits, which get
> > reflected in task_rlimit.  
> 
> Indeed, but that is not how the majority of other things seem to
> operate it.
> 
> > For example, let's say a user has two 4GB VMs and they're hot-adding
> > vfio devices to each of them, so libvirt needs to dynamically modify
> > the locked memory limit for each VM.  AIUI, libvirt would look at the
> > VM size and call prlimit to set that value.  If libvirt does this to
> > both VMs, then each has a task_rlimit of 4GB.  In vfio we add pinned
> > pages to mm->locked_vm, so this works well.  In the iommufd loop above,
> > we're comparing a per-task/process limit to a per-user counter.  So I'm
> > a bit lost how both VMs can pin their pages here.  
> 
> I don't know anything about libvirt - it seems strange to use a
> securityish feature like ulimit but not securely isolate processes
> with real users.
> 
> But if it really does this then it really does this.
> 
> So at the very least VFIO container has to keep working this way.
> 
> The next question is if we want iommufd's own device node to work this
> way and try to change libvirt somehow. It seems libvirt will have to
> deal with this at some point as iouring will trigger the same problem.
> 
> > > This whole area is a bit peculiar (eg mlock itself works differently),
> > > IMHO, but with most of the places doing pins voting to use
> > > user->locked_vm as the charge it seems the right path in today's
> > > kernel.  
> > 
> > The philosophy of whether it's ultimately a better choice for the
> > kernel aside, if userspace breaks because we're accounting in a
> > per-user pool rather than a per-process pool, then our compatibility
> > layer ain't so transparent.  
> 
> Sure, if it doesn't work it doesn't work. Lets be sure and clearly
> document what the compatability issue is and then we have to keep it
> per-process.
> 
> And the same reasoning likely means I can't change RDMA either as qemu
> will break just as well when qemu uses rdma mode.
> 
> Which is pretty sucky, but it is what it is..

I added Daniel Berrangé to the cc list for my previous reply, hopefully
he can comment whether libvirt has the sort of user security model you
allude to above that maybe makes this a non-issue for this use case.
Unfortunately it's extremely difficult to prove that there are no such
use cases out there even if libvirt is ok.  Thanks,

Alex


Re: [PATCH RFC 11/12] iommufd: vfio container FD ioctl compatibility

2022-03-24 Thread Alex Williamson
On Wed, 23 Mar 2022 21:33:42 -0300
Jason Gunthorpe  wrote:

> On Wed, Mar 23, 2022 at 04:51:25PM -0600, Alex Williamson wrote:
> 
> > My overall question here would be whether we can actually achieve a
> > compatibility interface that has sufficient feature transparency that we
> > can dump vfio code in favor of this interface, or will there be enough
> > niche use cases that we need to keep type1 and vfio containers around
> > through a deprecation process?  
> 
> Other than SPAPR, I think we can.

Does this mean #ifdef CONFIG_PPC in vfio core to retain infrastructure
for POWER support?

> > The locked memory differences for one seem like something that
> > libvirt wouldn't want hidden  
> 
> I'm first interested to have an understanding how this change becomes
> a real problem in practice that requires libvirt to do something
> different for vfio or iommufd. We can discuss in the other thread
> 
> If this is the make or break point then I think we can deal with it
> either by going back to what vfio does now or perhaps some other
> friendly compat approach..
> 
> > and we have questions regarding support for vaddr hijacking  
> 
> I'm not sure what vaddr hijacking is? Do you mean
> VFIO_DMA_MAP_FLAG_VADDR ? There is a comment that outlines my plan to
> implement it in a functionally compatible way without the deadlock
> problem. I estimate this as a small project.
> 
> > and different ideas how to implement dirty page tracking,   
> 
> I don't think this is compatibility. No kernel today triggers qemu to
> use this feature as no kernel supports live migration. No existing
> qemu will trigger this feature with new kernels that support live
> migration v2. Therefore we can adjust qemu's dirty tracking at the
> same time we enable migration v2 in qemu.

I guess I was assuming that enabling v2 migration in QEMU was dependent
on the existing type1 dirty tracking because it's the only means we
have to tell QEMU that all memory is perpetually dirty when we have a
DMA device.  Is that not correct?  If we don't intend to carry type1
dirty tracking into iommufd compatibility and we need it for this
purpose, then our window for being able to rip it out entirely closes
when QEMU gains v2 migration support.

> With Joao's work we are close to having a solid RFC to come with
> something that can be fully implemented.
> 
> Hopefully we can agree to this soon enough that qemu can come with a
> full package of migration v2 support including the dirty tracking
> solution.
> 
> > not to mention the missing features that are currently well used,
> > like p2p mappings, coherency tracking, mdev, etc.  
> 
> I consider these all mandatory things, they won't be left out.
> 
> The reason they are not in the RFC is mostly because supporting them
> requires work outside just this iommufd area, and I'd like this series
> to remain self-contained.
> 
> I've already got a draft to add DMABUF support to VFIO PCI which
> nicely solves the follow_pfn security problem, we want to do this for
> another reason already. I'm waiting for some testing feedback before
> posting it. Need some help from Daniel to make the DMABUF revoke semantics
> he and I have been talking about. In the worst case we can copy the
> follow_pfn approach.
> 
> Intel no-snoop is simple enough, just needs some Intel cleanup parts.
> 
> mdev will come along with the final VFIO integration, all the really
> hard parts are done already. The VFIO integration is a medium sized
> task overall.
> 
> So, I'm not ready to give up yet :)

Ok, that's a more promising outlook than I was inferring from the long
list of missing features.

> > Where do we focus attention?  Is symlinking device files our proposal
> > to userspace and is that something achievable, or do we want to use
> > this compatibility interface as a means to test the interface and
> > allow userspace to make use of it for transition, if their use cases
> > allow it, perhaps eventually performing the symlink after deprecation
> > and eventual removal of the vfio container and type1 code?  Thanks,  
> 
> symlinking device files is definitely just a suggested way to expedite
> testing.
> 
> Things like qemu that are learning to use iommufd-only features should
> learn to directly open iommufd instead of vfio container to activate
> those features.

Which is kind of the basis for my question, QEMU is racing for native
support, Eric and Yi are already working on this, so some of these
compatibility interfaces might only have short term usefulness.

> Looking long down the road I don't think we want to have type 1 and
> iommufd code forever.

Agreed.

> So

Re: [PATCH RFC 04/12] kernel/user: Allow user::locked_vm to be usable for iommufd

2022-03-24 Thread Alex Williamson
On Tue, 22 Mar 2022 13:15:21 -0300
Jason Gunthorpe via iommu  wrote:

> On Tue, Mar 22, 2022 at 09:29:23AM -0600, Alex Williamson wrote:
> 
> > I'm still picking my way through the series, but the later compat
> > interface doesn't mention this difference as an outstanding issue.
> > Doesn't this difference need to be accounted in how libvirt manages VM
> > resource limits?
> 
> AFACIT, no, but it should be checked.
> 
> > AIUI libvirt uses some form of prlimit(2) to set process locked
> > memory limits.  
> 
> Yes, and ulimit does work fully. prlimit adjusts the value:
> 
> int do_prlimit(struct task_struct *tsk, unsigned int resource,
>   struct rlimit *new_rlim, struct rlimit *old_rlim)
> {
>   rlim = tsk->signal->rlim + resource;
> [..]
>   if (new_rlim)
>   *rlim = *new_rlim;
> 
> Which vfio reads back here:
> 
> drivers/vfio/vfio_iommu_type1.c:unsigned long pfn, limit = 
> rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> drivers/vfio/vfio_iommu_type1.c:unsigned long limit = 
> rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> 
> And iommufd does the same read back:
> 
>   lock_limit =
>   task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>   npages = pages->npinned - pages->last_npinned;
>   do {
>   cur_pages = atomic_long_read(&pages->source_user->locked_vm);
>   new_pages = cur_pages + npages;
>   if (new_pages > lock_limit)
>   return -ENOMEM;
>   } while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages,
>new_pages) != cur_pages);
> 
> So it does work essentially the same.

Well, except for the part about vfio updating mm->locked_vm and iommufd
updating user->locked_vm, a per-process counter versus a per-user
counter.  prlimit specifically sets process resource limits, which get
reflected in task_rlimit.

For example, let's say a user has two 4GB VMs and they're hot-adding
vfio devices to each of them, so libvirt needs to dynamically modify
the locked memory limit for each VM.  AIUI, libvirt would look at the
VM size and call prlimit to set that value.  If libvirt does this to
both VMs, then each has a task_rlimit of 4GB.  In vfio we add pinned
pages to mm->locked_vm, so this works well.  In the iommufd loop above,
we're comparing a per-task/process limit to a per-user counter.  So I'm
a bit lost how both VMs can pin their pages here.

Am I missing some assumption about how libvirt users prlimit or
sandboxes users?
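Schematically, the mismatch is (a pseudo-kernel sketch using names from
the quoted snippets):

    /* vfio type1: per-process counter vs. per-process limit - each VM
     * has its own mm, so each can pin up to its own prlimit.
     */
    current->mm->locked_vm += npages;

    /* iommufd as posted: per-user counter vs. per-process limit -
     * both VMs run as the same user, so the second VM's pins stack on
     * the first's and exceed the 4GB limit.
     */
    atomic_long_add(npages, &pages->source_user->locked_vm);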

> The difference is more subtle, iouring/etc puts the charge in the user
> so it is additive with things like iouring and additively spans all
> the users processes.
> 
> However vfio is accounting only per-process and only for itself - no
> other subsystem uses locked as the charge variable for DMA pins.
> 
> The user visible difference will be that a limit X that worked with
> VFIO may start to fail after a kernel upgrade as the charge accounting
> is now cross user and additive with things like iommufd.

And that's exactly the concern.
 
> This whole area is a bit peculiar (eg mlock itself works differently),
> IMHO, but with most of the places doing pins voting to use
> user->locked_vm as the charge it seems the right path in today's
> kernel.

The philosophy of whether it's ultimately a better choice for the
kernel aside, if userspace breaks because we're accounting in a
per-user pool rather than a per-process pool, then our compatibility
layer ain't so transparent.

> Ceratinly having qemu concurrently using three different subsystems
> (vfio, rdma, iouring) issuing FOLL_LONGTERM and all accounting for
> RLIMIT_MEMLOCK differently cannot be sane or correct.

I think everyone would agree with that, but it also seems there are
real differences between task_rlimits and per-user vs per-process
accounting buckets and I'm confused how that's not a blocker for trying
to implement transparent compatibility.  Thanks,

Alex



Re: [PATCH RFC 11/12] iommufd: vfio container FD ioctl compatibility

2022-03-23 Thread Alex Williamson
On Fri, 18 Mar 2022 14:27:36 -0300
Jason Gunthorpe  wrote:

> iommufd can directly implement the /dev/vfio/vfio container IOCTLs by
> mapping them into io_pagetable operations. Doing so allows the use of
> iommufd by symlinking /dev/vfio/vfio to /dev/iommufd. Allowing VFIO to
> SET_CONTAINER using an iommufd instead of a container fd is a followup
> series.
> 
> Internally the compatibility API uses a normal IOAS object that, like
> vfio, is automatically allocated when the first device is
> attached.
> 
> Userspace can also query or set this IOAS object directly using the
> IOMMU_VFIO_IOAS ioctl. This allows mixing and matching new iommufd only
> features while still using the VFIO style map/unmap ioctls.
> 
> While this is enough to operate qemu, it is still a bit of a WIP with a
> few gaps to be resolved:
> 
>  - Only the TYPE1v2 mode is supported where unmap cannot punch holes or
>split areas. The old mode can be implemented with a new operation to
>split an iopt_area into two without disturbing the iopt_pages or the
>domains, then unmapping a whole area as normal.
> 
>  - Resource limits rely on memory cgroups to bound what userspace can do
>instead of the module parameter dma_entry_limit.
> 
>  - VFIO P2P is not implemented. Avoiding the follow_pfn() mis-design will
>require some additional work to properly expose PFN lifecycle between
>VFIO and iommufd
> 
>  - Various components of the mdev API are not completed yet
>
>  - Indefinite suspend of SW access (VFIO_DMA_MAP_FLAG_VADDR) is not
>implemented.
> 
>  - The 'dirty tracking' is not implemented
> 
>  - A full audit for pedantic compatibility details (eg errnos, etc) has
>not yet been done
> 
>  - powerpc SPAPR is left out, as it is not connected to the iommu_domain
>framework. My hope is that SPAPR will be moved into the iommu_domain
>framework as a special HW specific type and would expect power to
>support the generic interface through a normal iommu_domain.

My overall question here would be whether we can actually achieve a
compatibility interface that has sufficient feature transparency that we
can dump vfio code in favor of this interface, or will there be enough
niche use cases that we need to keep type1 and vfio containers around
through a deprecation process?

The locked memory differences for one seem like something that libvirt
wouldn't want hidden and we have questions regarding support for vaddr
hijacking and different ideas how to implement dirty page tracking, not
to mention the missing features that are currently well used, like p2p
mappings, coherency tracking, mdev, etc.

It seems like quite an endeavor to fill all these gaps, while at the
same time QEMU will be working to move to use iommufd directly in order
to gain all the new features.

Where do we focus attention?  Is symlinking device files our proposal
to userspace and is that something achievable, or do we want to use
this compatibility interface as a means to test the interface and
allow userspace to make use of it for transition, if their use cases
allow it, perhaps eventually performing the symlink after deprecation
and eventual removal of the vfio container and type1 code?  Thanks,

Alex



Re: [PATCH RFC 08/12] iommufd: IOCTLs for the io_pagetable

2022-03-23 Thread Alex Williamson
On Wed, 23 Mar 2022 16:34:39 -0300
Jason Gunthorpe  wrote:

> On Wed, Mar 23, 2022 at 01:10:38PM -0600, Alex Williamson wrote:
> > On Fri, 18 Mar 2022 14:27:33 -0300
> > Jason Gunthorpe  wrote:
> >   
> > > +static int conv_iommu_prot(u32 map_flags)
> > > +{
> > > + int iommu_prot;
> > > +
> > > + /*
> > > +  * We provide no manual cache coherency ioctls to userspace and most
> > > +  * architectures make the CPU ops for cache flushing privileged.
> > > +  * Therefore we require the underlying IOMMU to support CPU coherent
> > > +  * operation.
> > > +  */
> > > + iommu_prot = IOMMU_CACHE;  
> > 
> > Where is this requirement enforced?  AIUI we'd need to test
> > IOMMU_CAP_CACHE_COHERENCY somewhere since functions like
> > intel_iommu_map() simply drop the flag when not supported by HW.  
> 
> You are right, the correct thing to do is to fail device
> binding/attach entirely if IOMMU_CAP_CACHE_COHERENCY is not there,
> however we can't do that because Intel abuses the meaning of
> IOMMU_CAP_CACHE_COHERENCY to mean their special no-snoop behavior is
> supported.
> 
> I want Intel to split out their special no-snoop from IOMMU_CACHE and
> IOMMU_CAP_CACHE_COHERENCY so these things have a consisent meaning in
> all iommu drivers. Once this is done vfio and iommufd should both
> always set IOMMU_CACHE and refuse to work without
> IOMMU_CAP_CACHE_COHERENCY. (unless someone knows of an !IOMMU_CACHE
> arch that does in fact work today with vfio, somehow, but I don't..)

IIRC, the DMAR on Intel CPUs dedicated to IGD was where we'd often see
lack of snoop-control support, causing us to have mixed coherent and
non-coherent domains.  I don't recall if you go back far enough in VT-d
history if the primary IOMMU might have lacked this support.  So I
think there are systems we care about with IOMMUs that can't enforce
DMA coherency.

As it is today, if the IOMMU reports IOMMU_CAP_CACHE_COHERENCY and all
mappings make use of IOMMU_CACHE, then all DMA is coherent.  Are you
suggesting IOMMU_CAP_CACHE_COHERENCY should indicate that all mappings
are coherent regardless of mapping protection flags?  What's the point
of IOMMU_CACHE at that point?
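The behavior in question is visible in the VT-d map path (a paraphrased
sketch of the intel_iommu_map() logic of this era):

    /* IOMMU_CACHE is honored only with snoop control; otherwise it is
     * silently dropped rather than failing the map.
     */
    if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
            prot |= DMA_PTE_SNP;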

> I added a fixme about this.
> 
> > This also seems like an issue relative to vfio compatibility that I
> > don't see mentioned in that patch.  Thanks,  
> 
> Yes, it was missed in the notes for vfio compat that Intel no-snoop is
> not working currently, I fixed it.

Right, I see it in the comments relative to extensions, but missed in
the commit log.  Thanks,

Alex



Re: [PATCH RFC 08/12] iommufd: IOCTLs for the io_pagetable

2022-03-23 Thread Alex Williamson
On Fri, 18 Mar 2022 14:27:33 -0300
Jason Gunthorpe  wrote:

> +static int conv_iommu_prot(u32 map_flags)
> +{
> + int iommu_prot;
> +
> + /*
> +  * We provide no manual cache coherency ioctls to userspace and most
> +  * architectures make the CPU ops for cache flushing privileged.
> +  * Therefore we require the underlying IOMMU to support CPU coherent
> +  * operation.
> +  */
> + iommu_prot = IOMMU_CACHE;

Where is this requirement enforced?  AIUI we'd need to test
IOMMU_CAP_CACHE_COHERENCY somewhere since functions like
intel_iommu_map() simply drop the flag when not supported by HW.

This also seems like an issue relative to vfio compatibility that I
don't see mentioned in that patch.  Thanks,

Alex

> + if (map_flags & IOMMU_IOAS_MAP_WRITEABLE)
> + iommu_prot |= IOMMU_WRITE;
> + if (map_flags & IOMMU_IOAS_MAP_READABLE)
> + iommu_prot |= IOMMU_READ;
> + return iommu_prot;
> +}




Re: [PATCH RFC 10/12] iommufd: Add kAPI toward external drivers

2022-03-23 Thread Alex Williamson
On Fri, 18 Mar 2022 14:27:35 -0300
Jason Gunthorpe  wrote:
> +/**
> + * iommufd_bind_pci_device - Bind a physical device to an iommu fd
> + * @fd: iommufd file descriptor.
> + * @pdev: Pointer to a physical PCI device struct
> + * @id: Output ID number to return to userspace for this device
> + *
> + * A successful bind establishes an ownership over the device and returns
> + * struct iommufd_device pointer, otherwise returns error pointer.
> + *
> + * A driver using this API must set driver_managed_dma and must not touch
> + * the device until this routine succeeds and establishes ownership.
> + *
> + * Binding a PCI device places the entire RID under iommufd control.
> + *
> + * The caller must undo this with iommufd_unbind_device()
> + */
> +struct iommufd_device *iommufd_bind_pci_device(int fd, struct pci_dev *pdev,
> +u32 *id)
> +{
> + struct iommufd_device *idev;
> + struct iommufd_ctx *ictx;
> + struct iommu_group *group;
> + int rc;
> +
> + ictx = iommufd_fget(fd);
> + if (!ictx)
> + return ERR_PTR(-EINVAL);
> +
> + group = iommu_group_get(&pdev->dev);
> + if (!group) {
> + rc = -ENODEV;
> + goto out_file_put;
> + }
> +
> + /*
> +  * FIXME: Use a device-centric iommu api and this won't work with
> +  * multi-device groups
> +  */
> + rc = iommu_group_claim_dma_owner(group, ictx->filp);
> + if (rc)
> + goto out_group_put;
> +
> + idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE);
> + if (IS_ERR(idev)) {
> + rc = PTR_ERR(idev);
> + goto out_release_owner;
> + }
> + idev->ictx = ictx;
> + idev->dev = &pdev->dev;
> + /* The calling driver is a user until iommufd_unbind_device() */
> + refcount_inc(&idev->obj.users);
> + /* group refcount moves into iommufd_device */
> + idev->group = group;
> +
> + /*
> +  * If the caller fails after this success it must call
> +  * iommufd_unbind_device() which is safe since we hold this refcount.
> +  * This also means the device is a leaf in the graph and no other object
> +  * can take a reference on it.
> +  */
> + iommufd_object_finalize(ictx, &idev->obj);
> + *id = idev->obj.id;
> + return idev;
> +
> +out_release_owner:
> + iommu_group_release_dma_owner(group);
> +out_group_put:
> + iommu_group_put(group);
> +out_file_put:
> + fput(ictx->filp);
> + return ERR_PTR(rc);
> +}
> +EXPORT_SYMBOL_GPL(iommufd_bind_pci_device);

I'm stumped why this needs to be PCI specific.  Anything beyond the RID
comment?  Please enlighten.  Thanks,

Alex
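For comparison, a device-generic variant would seemingly need only a
signature change (hypothetical sketch):

    struct iommufd_device *iommufd_bind_device(int fd, struct device *dev,
                                               u32 *id)
    {
            /* Same body: iommu_group_get(dev), claim DMA ownership,
             * allocate the iommufd object - nothing below the RID
             * comment is PCI-specific.
             */
    }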



Re: [PATCH RFC 07/12] iommufd: Data structure to provide IOVA to PFN mapping

2022-03-22 Thread Alex Williamson
On Fri, 18 Mar 2022 14:27:32 -0300
Jason Gunthorpe  wrote:
> +/*
> + * The area takes a slice of the pages from start_byte to start_byte + length
> + */
> +static struct iopt_area *
> +iopt_alloc_area(struct io_pagetable *iopt, struct iopt_pages *pages,
> + unsigned long iova, unsigned long start_byte,
> + unsigned long length, int iommu_prot, unsigned int flags)
> +{
> + struct iopt_area *area;
> + int rc;
> +
> + area = kzalloc(sizeof(*area), GFP_KERNEL);
> + if (!area)
> + return ERR_PTR(-ENOMEM);
> +
> + area->iopt = iopt;
> + area->iommu_prot = iommu_prot;
> + area->page_offset = start_byte % PAGE_SIZE;
> + area->pages_node.start = start_byte / PAGE_SIZE;
> + if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
> + return ERR_PTR(-EOVERFLOW);
> + area->pages_node.last = area->pages_node.last / PAGE_SIZE;
> + if (WARN_ON(area->pages_node.last >= pages->npages))
> + return ERR_PTR(-EOVERFLOW);

@area leaked in the above two error cases.
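i.e. something along these lines for both early exits (a sketch of the
fix):

    if (check_add_overflow(start_byte, length - 1,
                           &area->pages_node.last)) {
            kfree(area);    /* don't leak the allocation */
            return ERR_PTR(-EOVERFLOW);
    }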

> +
> + down_write(&iopt->iova_rwsem);
> + if (flags & IOPT_ALLOC_IOVA) {
> + rc = iopt_alloc_iova(iopt, &iova,
> +  (uintptr_t)pages->uptr + start_byte,
> +  length);
> + if (rc)
> + goto out_unlock;
> + }
> +
> + if (check_add_overflow(iova, length - 1, &area->node.last)) {
> + rc = -EOVERFLOW;
> + goto out_unlock;
> + }
> +
> + if (!(flags & IOPT_ALLOC_IOVA)) {
> + if ((iova & (iopt->iova_alignment - 1)) ||
> + (length & (iopt->iova_alignment - 1)) || !length) {
> + rc = -EINVAL;
> + goto out_unlock;
> + }
> +
> + /* No reserved IOVA intersects the range */
> + if (interval_tree_iter_first(&iopt->reserved_iova_itree, iova,
> +  area->node.last)) {
> + rc = -ENOENT;
> + goto out_unlock;
> + }
> +
> + /* Check that there is not already a mapping in the range */
> + if (iopt_area_iter_first(iopt, iova, area->node.last)) {
> + rc = -EADDRINUSE;
> + goto out_unlock;
> + }
> + }
> +
> + /*
> +  * The area is inserted with a NULL pages indicating it is not fully
> +  * initialized yet.
> +  */
> + area->node.start = iova;
> + interval_tree_insert(&area->node, &area->iopt->area_itree);
> + up_write(&iopt->iova_rwsem);
> + return area;
> +
> +out_unlock:
> + up_write(&iopt->iova_rwsem);
> + kfree(area);
> + return ERR_PTR(rc);
> +}
...
> +/**
> + * iopt_access_pages() - Return a list of pages under the iova
> + * @iopt: io_pagetable to act on
> + * @iova: Starting IOVA
> + * @length: Number of bytes to access
> + * @out_pages: Output page list
> + * @write: True if access is for writing
> + *
> + * Reads @npages starting at iova and returns the struct page * pointers. These
> + * can be kmap'd by the caller for CPU access.
> + *
> + * The caller must perform iopt_unaccess_pages() when done to balance this.
> + *
> + * iova can be unaligned from PAGE_SIZE. The first returned byte starts at
> + * page_to_phys(out_pages[0]) + (iova % PAGE_SIZE). The caller promises not 
> to
> + * touch memory outside the requested iova slice.
> + *
> + * FIXME: callers that need a DMA mapping via a sgl should create another
> + * interface to build the SGL efficiently
> + */
> +int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
> +   unsigned long length, struct page **out_pages, bool write)
> +{
> + unsigned long cur_iova = iova;
> + unsigned long last_iova;
> + struct iopt_area *area;
> + int rc;
> +
> + if (!length)
> + return -EINVAL;
> + if (check_add_overflow(iova, length - 1, &last_iova))
> + return -EOVERFLOW;
> +
> + down_read(&iopt->iova_rwsem);
> + for (area = iopt_area_iter_first(iopt, iova, last_iova); area;
> +  area = iopt_area_iter_next(area, iova, last_iova)) {
> + unsigned long last = min(last_iova, iopt_area_last_iova(area));
> + unsigned long last_index;
> + unsigned long index;
> +
> + /* Need contiguous areas in the access */
> + if (iopt_area_iova(area) < cur_iova || !area->pages) {
^^^
Should this be (cur_iova != iova && iopt_area_iova(area) < cur_iova)?

I can't see how we'd require in-kernel page users to know the iopt_area
alignment from userspace, so I think this needs to skip the first
iteration.  Thanks,

Alex

> + rc = -EINVAL;
> + goto out_remove;
> + }
> +
> + index = iopt_area_iova_to_index(area, cur_iova);

Re: [PATCH RFC 04/12] kernel/user: Allow user::locked_vm to be usable for iommufd

2022-03-22 Thread Alex Williamson
On Tue, 22 Mar 2022 11:57:41 -0300
Jason Gunthorpe  wrote:

> On Tue, Mar 22, 2022 at 03:28:22PM +0100, Niklas Schnelle wrote:
> > On Fri, 2022-03-18 at 14:27 -0300, Jason Gunthorpe wrote:  
> > > 
> > > user->locked_vm is the correct accounting to use for ulimit because it is
> > > per-user, and the ulimit is not supposed to be per-process. Other
> > > places (vfio, vdpa and infiniband) have used mm->pinned_vm and/or
> > > mm->locked_vm for accounting pinned pages, but this is only per-process
> > > and inconsistent with the majority of the kernel.  
> > 
> > Since this will replace parts of vfio this difference seems
> > significant. Can you explain this a bit more?  
> 
> I'm not sure what to say more, this is the correct way to account for
> this. It is natural to see it is right because the ulimit is supposted
> to be global to the user, not effectively reset every time the user
> creates a new process.
> 
> So checking the ulimit against a per-process variable in the mm_struct
> doesn't make too much sense.

I'm still picking my way through the series, but the later compat
interface doesn't mention this difference as an outstanding issue.
Doesn't this difference need to be accounted in how libvirt manages VM
resource limits?  AIUI libvirt uses some form of prlimit(2) to set
process locked memory limits.  A compat interface might be an
interesting feature, but does it only provide ioctl compatibility and
not resource ulimit compatibility?  Thanks,

Alex



Re: [PATCH v4 15/32] vfio: introduce KVM-owned IOMMU type

2022-03-14 Thread Alex Williamson
On Mon, 14 Mar 2022 15:44:34 -0400
Matthew Rosato  wrote:

> s390x will introduce a new IOMMU domain type where the mappings are
> managed by KVM rather than in response to userspace mapping ioctls.  Allow
> for specifying this type on the VFIO_SET_IOMMU ioctl and triggering the
> appropriate iommu interface for overriding the default domain.
> 
> Signed-off-by: Matthew Rosato 
> ---
>  drivers/vfio/vfio_iommu_type1.c | 12 +++-
>  include/uapi/linux/vfio.h   |  6 ++
>  2 files changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 9394aa9444c1..0bec97077d61 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -77,6 +77,7 @@ struct vfio_iommu {
>   boolnesting;
>   booldirty_page_tracking;
>   boolcontainer_open;
> + boolkvm;
>   struct list_heademulated_iommu_groups;
>  };
>  
> @@ -2203,7 +2204,12 @@ static int vfio_iommu_type1_attach_group(void 
> *iommu_data,
>   goto out_free_group;
>  
>   ret = -EIO;
> - domain->domain = iommu_domain_alloc(bus);
> +
> + if (iommu->kvm)
> + domain->domain = iommu_domain_alloc_type(bus, IOMMU_DOMAIN_KVM);
> + else
> + domain->domain = iommu_domain_alloc(bus);
> +
>   if (!domain->domain)
>   goto out_free_domain;
>  
> @@ -2552,6 +2558,9 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>   case VFIO_TYPE1v2_IOMMU:
>   iommu->v2 = true;
>   break;
> + case VFIO_KVM_IOMMU:
> + iommu->kvm = true;
> + break;
>   default:
>   kfree(iommu);
>   return ERR_PTR(-EINVAL);
> @@ -2637,6 +2646,7 @@ static int vfio_iommu_type1_check_extension(struct 
> vfio_iommu *iommu,
>   case VFIO_TYPE1_NESTING_IOMMU:
>   case VFIO_UNMAP_ALL:
>   case VFIO_UPDATE_VADDR:
> + case VFIO_KVM_IOMMU:
>   return 1;
>   case VFIO_DMA_CC_IOMMU:
>   if (!iommu)
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index ef33ea002b0b..666edb6957ac 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -52,6 +52,12 @@
>  /* Supports the vaddr flag for DMA map and unmap */
>  #define VFIO_UPDATE_VADDR10
>  
> +/*
> + * The KVM_IOMMU type implies that the hypervisor will control the mappings
> + * rather than userspace
> + */
> +#define VFIO_KVM_IOMMU   11

Then why is this hosted in the type1 code that exposes a wide variety
of userspace interfaces?  Thanks,

Alex



Re: [PATCH v7 10/11] vfio: Remove iommu group notifier

2022-02-28 Thread Alex Williamson
On Mon, 28 Feb 2022 08:50:55 +0800
Lu Baolu  wrote:

> The iommu core and driver core have been enhanced to avoid unsafe driver
> binding to a live group after iommu_group_set_dma_owner(PRIVATE_USER)
> has been called. There's no need to register iommu group notifier. This
> removes the iommu group notifier which contains BUG_ON() and WARN().
> 
> Signed-off-by: Lu Baolu 
> Reviewed-by: Jason Gunthorpe 
> ---
>  drivers/vfio/vfio.c | 147 
>  1 file changed, 147 deletions(-)

Acked-by: Alex Williamson 



Re: [PATCH v7 09/11] vfio: Delete the unbound_list

2022-02-28 Thread Alex Williamson
On Mon, 28 Feb 2022 08:50:54 +0800
Lu Baolu  wrote:

> From: Jason Gunthorpe 
> 
> commit 60720a0fc646 ("vfio: Add device tracking during unbind") added the
> unbound list to plug a problem with KVM where KVM_DEV_VFIO_GROUP_DEL
> relied on vfio_group_get_external_user() succeeding to return the
> vfio_group from a group file descriptor. The unbound list allowed
> vfio_group_get_external_user() to continue to succeed in edge cases.
> 
> However commit 5d6dee80a1e9 ("vfio: New external user group/file match")
> deleted the call to vfio_group_get_external_user() during
> KVM_DEV_VFIO_GROUP_DEL. Instead vfio_external_group_match_file() is used
> to directly match the file descriptor to the group pointer.
> 
> This in turn avoids the call down to vfio_dev_viable() during
> KVM_DEV_VFIO_GROUP_DEL and also avoids the trouble the first commit was
> trying to fix.
> 
> There are no other users of vfio_dev_viable() that care about the time
> after vfio_unregister_group_dev() returns, so simply delete the
> unbound_list entirely.
> 
> Reviewed-by: Chaitanya Kulkarni 
> Signed-off-by: Jason Gunthorpe 
> Signed-off-by: Lu Baolu 
> ---
>  drivers/vfio/vfio.c | 74 ++-------
>  1 file changed, 2 insertions(+), 72 deletions(-)

Acked-by: Alex Williamson 



Re: [PATCH v7 08/11] vfio: Remove use of vfio_group_viable()

2022-02-28 Thread Alex Williamson
On Mon, 28 Feb 2022 08:50:53 +0800
Lu Baolu  wrote:

> As DMA ownership is claimed for the iommu group when a VFIO group is
> added to a VFIO container, the VFIO group viability is guaranteed as long
> as group->container_users > 0. Remove those unnecessary group viability
> checks which are only hit when group->container_users is not zero.
> 
> The only remaining reference is in GROUP_GET_STATUS, which could be called
> at any time when group fd is valid. Here we just replace the
> vfio_group_viable() by directly calling IOMMU core to get viability status.
> 
> Signed-off-by: Lu Baolu 
> Reviewed-by: Jason Gunthorpe 
> ---
>  drivers/vfio/vfio.c | 18 ++
>  1 file changed, 6 insertions(+), 12 deletions(-)

Acked-by: Alex Williamson 



Re: [PATCH v7 07/11] vfio: Set DMA ownership for VFIO devices

2022-02-28 Thread Alex Williamson
On Mon, 28 Feb 2022 08:50:52 +0800
Lu Baolu  wrote:

> Claim group dma ownership when an IOMMU group is set to a container,
> and release the dma ownership once the iommu group is unset from the
> container.
> 
> This change disallows some unsafe bridge drivers to bind to non-ACS
> bridges while devices under them are assigned to user space. This is an
> intentional enhancement and possibly breaks some existing
> configurations. The recommendation to such an affected user would be
> that the previously allowed host bridge driver was unsafe for this use
> case and to continue to enable assignment of devices within that group,
> the driver should be unbound from the bridge device or replaced with the
> pci-stub driver.
> 
> For any bridge driver, we consider it unsafe if it satisfies any of the
> following conditions:
> 
>   1) The bridge driver uses DMA. Calling pci_set_master() or calling any
>  kernel DMA API (dma_map_*() etc.) is an indication that the
>  driver is doing DMA.
> 
>   2) If the bridge driver uses MMIO, it should be tolerant to hostile
>  userspace also touching the same MMIO registers via P2P DMA
>  attacks.
> 
> If the bridge driver turns out to be a safe one, it could be used as
> before by setting the driver's .driver_managed_dma field, just like what
> we have done in the pcieport driver.
> 
> Signed-off-by: Lu Baolu 
> Reviewed-by: Jason Gunthorpe 
> ---
>  drivers/vfio/fsl-mc/vfio_fsl_mc.c |  1 +
>  drivers/vfio/pci/vfio_pci.c   |  1 +
>  drivers/vfio/platform/vfio_amba.c |  1 +
>  drivers/vfio/platform/vfio_platform.c |  1 +
>  drivers/vfio/vfio.c   | 10 +-
>  5 files changed, 13 insertions(+), 1 deletion(-)

Acked-by: Alex Williamson 



Re: [PATCH v6 10/11] vfio: Remove iommu group notifier

2022-02-23 Thread Alex Williamson
On Fri, 18 Feb 2022 08:55:20 +0800
Lu Baolu  wrote:

> The iommu core and driver core have been enhanced to avoid unsafe driver
> binding to a live group after iommu_group_set_dma_owner(PRIVATE_USER)
> has been called. There's no need to register iommu group notifier. This
> removes the iommu group notifier which contains BUG_ON() and WARN().
> 
> The commit 5f096b14d421b ("vfio: Whitelist PCI bridges") allowed all
> pcieport drivers to be bound with devices while the group is assigned to
> user space. This is not always safe. For example, The shpchp_core driver
> relies on the PCI MMIO access for the controller functionality. With its
> downstream devices assigned to the userspace, the MMIO might be changed
> through user initiated P2P accesses without any notification. This might
> break the kernel driver integrity and lead to some unpredictable
> consequences. As a result, currently we only allow the portdrv driver.
> 
> For any bridge driver, in order to avoid default kernel DMA ownership
> claiming, we should consider:
> 
>  1) Does the bridge driver use DMA? Calling pci_set_master() or
> a dma_map_* API is a sure indication that the driver is doing DMA
> 
>  2) If the bridge driver uses MMIO, is it tolerant to hostile
> userspace also touching the same MMIO registers via P2P DMA
> attacks?
> 
> Conservatively if the driver maps an MMIO region at all, we can say that
> it fails the test.

IIUC, there's a chance we're going to break user configurations if
they're assigning devices from a group containing a bridge that uses a
driver other than pcieport.  The recommendation to such an affected user
would be that the previously allowed host bridge driver was unsafe for
this use case and to continue to enable assignment of devices within
that group, the driver should be unbound from the bridge device or
replaced with the pci-stub driver.  Is that right?

Unfortunately I also think a bisect of such a breakage wouldn't land
here, I think it was actually broken in "vfio: Set DMA ownership for
VFIO" since that's where vfio starts to make use of
iommu_group_claim_dma_owner() which should fail due to
pci_dma_configure() calling iommu_device_use_default_domain() for
any driver not identifying itself as driver_managed_dma.

If that's correct, can we leave a breadcrumb in the correct commit log
indicating why this potential breakage is intentional and how the
bridge driver might be reconfigured to continue to allow assignment from
within the group more safely?  Thanks,

Alex



[PATCH] iommu/vt-d: Fix unmap_pages support

2021-11-10 Thread Alex Williamson
When supporting only the .map and .unmap callbacks of iommu_ops,
the IOMMU driver can make assumptions about the size and alignment
used for mappings based on the driver provided pgsize_bitmap.  VT-d
previously used essentially PAGE_MASK for this bitmap as any power
of two mapping was acceptably filled by native page sizes.

However, with the .map_pages and .unmap_pages interface we're now
getting page-size and count arguments.  If we simply combine these
as (page-size * count) and make use of the previous map/unmap
functions internally, any size and alignment assumptions are very
different.

As an example, a given vfio device assignment VM will often create
a 4MB mapping at IOVA pfn [0x3fe00 - 0x401ff].  On a system that
does not support IOMMU super pages, the unmap_pages interface will
ask to unmap 1024 4KB pages at the base IOVA.  dma_pte_clear_level()
will recurse down to level 2 of the page table where the first half
of the pfn range exactly matches the entire pte level.  We clear the
pte, increment the pfn by the level size, but (oops) the next pte is
on a new page, so we exit the loop and pop back up a level.  When we
then update the pfn based on that higher level, we seem to assume
that the previous pfn value was at the start of the level.  In this
case the level size is 256K pfns, which we add to the base pfn and
get a result of 0x7fe00, which is clearly greater than 0x401ff,
so we're done.  Meanwhile we never cleared the ptes for the remainder
of the range.  When the VM remaps this range, we're overwriting valid
ptes and the VT-d driver complains loudly, as reported by the user
report linked below.

The fix for this seems relatively simple, if each iteration of the
loop in dma_pte_clear_level() is assumed to clear to the end of the
level pte page, then our next pfn should be calculated from level_pfn
rather than our working pfn.

Fixes: 3f34f1259776 ("iommu/vt-d: Implement map/unmap_pages() iommu_ops callback")
Reported-by: Ajay Garg 
Link: https://lore.kernel.org/all/20211002124012.18186-1-ajaygargn...@gmail.com/
Signed-off-by: Alex Williamson 
---
 drivers/iommu/intel/iommu.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index d75f59ae28e6..f6395f5425f0 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1249,7 +1249,7 @@ static struct page *dma_pte_clear_level(struct 
dmar_domain *domain, int level,
   freelist);
}
 next:
-   pfn += level_size(level);
+   pfn = level_pfn + level_size(level);
} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
 
if (first_pte)




Re: [PATCH v2] iommu: intel: do deep dma-unmapping, to avoid kernel-flooding.

2021-11-09 Thread Alex Williamson


Hi Baolu,

Have you looked into this?  I'm able to reproduce by starting and
destroying an assigned device VM several times.  It seems like it came
in with Joerg's pull request for the v5.15 merge window.  Bisecting
lands me on 3f34f1259776 where intel-iommu added map/unmap_pages
support, but I'm not convinced that isn't just an artifact of the
regular map/unmap calls having been simplified to single-page use by
that point.  If I mask the map/unmap_pages callbacks and use
map/unmap with (pgsize * size) and restore the previous pgsize_bitmap,
I can generate the same faults.  So maybe the root issue was introduced
somewhere else, or perhaps it is a latent bug in clearing of pte ranges
as Ajay proposes below.  In any case, I think there's a real issue
here.  Thanks,

Alex

On Tue, 12 Oct 2021 19:26:53 +0530
Ajay Garg  wrote:

> === Issue ===
> 
> Kernel-flooding is seen when an x86_64 L1 guest (Ubuntu-21) is booted in
> qemu/kvm on an x86_64 host (Ubuntu-21), with a host-pci-device attached.
> 
> The following kinds of logs, along with the stacktraces, cause the flood:
> 
> ..
>  DMAR: ERROR: DMA PTE for vPFN 0x428ec already set (to 3f6ec003 not 3f6ec003)
>  DMAR: ERROR: DMA PTE for vPFN 0x428ed already set (to 3f6ed003 not 3f6ed003)
>  DMAR: ERROR: DMA PTE for vPFN 0x428ee already set (to 3f6ee003 not 3f6ee003)
>  DMAR: ERROR: DMA PTE for vPFN 0x428ef already set (to 3f6ef003 not 3f6ef003)
>  DMAR: ERROR: DMA PTE for vPFN 0x428f0 already set (to 3f6f0003 not 3f6f0003)
> ..
> 
> 
> 
> === Current Behaviour, leading to the issue ===
> 
> Currently, when we do a dma-unmapping, we unmap/unlink the mappings, but
> the pte-entries are not cleared.
> 
> Thus, the following sequence would flood the kernel logs:
> 
> i)
> A dma-unmapping makes the real/leaf-level pte-slot invalid, but the 
> pte-content itself is not cleared.
> 
> ii)
> Now, during some later dma-mapping procedure, as the pte-slot is about
> to hold a new pte-value, the intel-iommu checks if a prior 
> pte-entry exists in the pte-slot. If it exists, it logs a kernel-error,
> along with a corresponding stacktrace.
> 
> iii)
> Step ii) runs in abundance, and the kernel-logs run insane.
> 
> 
> 
> === Fix ===
> 
> We ensure that as part of a dma-unmapping, each (unmapped) pte-slot
> is also cleared of its value/content (at the leaf level, where the
> real iova => pfn mapping is stored).
> 
> This completes a "deep" dma-unmapping.
> 
> 
> 
> Signed-off-by: Ajay Garg 
> ---
>  drivers/iommu/intel/iommu.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index d75f59ae28e6..485a8ea71394 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -5090,6 +5090,8 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain,
>   gather->freelist = domain_unmap(dmar_domain, start_pfn,
>   last_pfn, gather->freelist);
>  
> + dma_pte_clear_range(dmar_domain, start_pfn, last_pfn);
> +
>   if (dmar_domain->max_addr == iova + size)
>   dmar_domain->max_addr = iova;
>  



Re: [PATCH] iommu: intel: remove flooding of non-error logs, when new-DMA-PTE is the same as old-DMA-PTE.

2021-10-11 Thread Alex Williamson
On Mon, 11 Oct 2021 11:49:33 +0530
Ajay Garg  wrote:

> The flooding was seen today again, after I booted the host-machine in
> the morning.
> Need to look what the heck is going on ...
> 
> On Sun, Oct 10, 2021 at 11:45 AM Ajay Garg  wrote:
> >  
> > > I'll try and backtrack to the userspace process that is sending these 
> > > ioctls.
> > >  
> >
> > The userspace process is qemu.
> >
> > I compiled qemu from latest source, installed via "sudo make install"
> > on host-machine, rebooted the host-machine, and booted up the
> > guest-machine on the host-machine. Now, no kernel-flooding is seen on
> > the host-machine.
> >
> > For me, the issue is thus closed-invalid; admins may take the
> > necessary action to officially mark ;)

Even this QEMU explanation doesn't make a lot of sense, vfio tracks
userspace mappings and will return an -EEXIST error for duplicate or
overlapping IOVA entries.  We expect to have an entirely empty IOMMU
domain when a device is assigned, but it seems the only way userspace
can trigger duplicate PTEs would be if mappings already exist, or we
have a bug somewhere.
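
(For reference, a simplified sketch of that duplicate-IOVA check,
loosely modeled on drivers/vfio/vfio_iommu_type1.c; abbreviated, not
the exact kernel code:)

  static int sketch_check_overlap(struct vfio_iommu *iommu,
                                  dma_addr_t iova, size_t size)
  {
          int ret = 0;

          mutex_lock(&iommu->lock);
          /* any overlap with an existing vfio_dma node is rejected */
          if (vfio_find_dma(iommu, iova, size))
                  ret = -EEXIST;
          mutex_unlock(&iommu->lock);

          return ret;
  }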

If the most recent instance is purely on bare metal, then it seems the
host itself has conflicting mappings.  I can only speculate with the
limited data presented, but I'm suspicious there's something happening
with RMRRs here (but that should also entirely preclude assignment).
dmesg, lspci -vvv, and VM configuration would be useful.  Thanks,

Alex



Re: [PATCH] iommu: intel: remove flooding of non-error logs, when new-DMA-PTE is the same as old-DMA-PTE.

2021-10-04 Thread Alex Williamson
On Sat, 2 Oct 2021 22:48:24 +0530
Ajay Garg  wrote:

> Thanks Lu for the reply.
> 
> >
> > Isn't the domain should be switched from a default domain to an
> > unmanaged domain when the device is assigned to the guest?
> >
> > Even you want to r-setup the same mappings, you need to un-map all
> > existing mappings, right?
> >  
> 
> Hmm, I guess that's a (design) decision the KVM/QEMU/VFIO communities
> need to take.
> May be the patch could suppress the flooding till then?

No, this is wrong.  The pte values should not exist, it doesn't matter
that they're the same.  Is the host driver failing to remove mappings
and somehow they persist in the new vfio owned domain?  There's
definitely a bug beyond logging going on here.  Thanks,

Alex



Re: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

2021-09-29 Thread Alex Williamson
On Wed, 29 Sep 2021 12:08:59 +1000
David Gibson  wrote:

> On Sun, Sep 19, 2021 at 02:38:30PM +0800, Liu Yi L wrote:
> > This patch introduces a new interface (/dev/vfio/devices/$DEVICE) for
> > userspace to directly open a vfio device w/o relying on container/group
> > (/dev/vfio/$GROUP). Anything related to group is now hidden behind
> > iommufd (more specifically in iommu core by this RFC) in a device-centric
> > manner.
> > 
> > In case a device is exposed in both legacy and new interfaces (see next
> > patch for how to decide it), this patch also ensures that when the device
> > is already opened via one interface then the other one must be blocked.
> > 
> > Signed-off-by: Liu Yi L   
> [snip]
> 
> > +static bool vfio_device_in_container(struct vfio_device *device)
> > +{
> > +   return !!(device->group && device->group->container);  
> 
> You don't need !! here.  && is already a logical operation, so returns
> a valid bool.
> 
> > +}
> > +
> >  static int vfio_device_fops_release(struct inode *inode, struct file 
> > *filep)
> >  {
> > struct vfio_device *device = filep->private_data;
> > @@ -1560,7 +1691,16 @@ static int vfio_device_fops_release(struct inode 
> > *inode, struct file *filep)
> >  
> > module_put(device->dev->driver->owner);
> >  
> > -   vfio_group_try_dissolve_container(device->group);
> > +   if (vfio_device_in_container(device)) {
> > +   vfio_group_try_dissolve_container(device->group);
> > +   } else {
> > +   atomic_dec(&device->opened);
> > +   if (device->group) {
> > +   mutex_lock(&device->group->opened_lock);
> > +   device->group->opened--;
> > +   mutex_unlock(&device->group->opened_lock);
> > +   }
> > +   }
> >  
> > vfio_device_put(device);
> >  
> > @@ -1613,6 +1753,7 @@ static int vfio_device_fops_mmap(struct file *filep, 
> > struct vm_area_struct *vma)
> >  
> >  static const struct file_operations vfio_device_fops = {
> > .owner  = THIS_MODULE,
> > +   .open   = vfio_device_fops_open,
> > .release= vfio_device_fops_release,
> > .read   = vfio_device_fops_read,
> > .write  = vfio_device_fops_write,
> > @@ -2295,6 +2436,52 @@ static struct miscdevice vfio_dev = {
> > .mode = S_IRUGO | S_IWUGO,
> >  };
> >  
> > +static char *vfio_device_devnode(struct device *dev, umode_t *mode)
> > +{
> > +   return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));  
> 
> Others have pointed out some problems with the use of dev_name()
> here.  I'll add that I think you'll make things much easier if instead
> of using one huge "devices" subdir, you use a separate subdir for each
> vfio sub-driver (so, one for PCI, one for each type of mdev, one for
> platform, etc.).  That should make avoiding name conflicts a lot simpler.

It seems like this is unnecessary if we use the vfioX naming approach
(sketch below).  Conflicts are trivially avoided if we don't involve
dev_name(), and looking for the correct major:minor chardev in the
correct subdirectory seems like a hassle for userspace.  Thanks,

Alex
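
As a sketch of that idea (hypothetical; assumes each vfio device
already has its own chardev minor):

  static char *vfio_device_devnode(struct device *dev, umode_t *mode)
  {
          /* name nodes by instance number rather than dev_name() */
          return kasprintf(GFP_KERNEL, "vfio/devices/vfio%d",
                           MINOR(dev->devt));
  }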



Re: [RFC 03/20] vfio: Add vfio_[un]register_device()

2021-09-22 Thread Alex Williamson
On Wed, 22 Sep 2021 22:34:42 +
"Tian, Kevin"  wrote:

> > From: Alex Williamson 
> > Sent: Thursday, September 23, 2021 4:11 AM
> > 
> > On Wed, 22 Sep 2021 09:22:52 -0300
> > Jason Gunthorpe  wrote:
> >   
> > > On Wed, Sep 22, 2021 at 09:23:34AM +, Tian, Kevin wrote:
> > >  
> > > > > Providing an ioctl to bind to a normal VFIO container or group might
> > > > > allow a reasonable fallback in userspace..  
> > > >
> > > > I didn't get this point though. An error in binding already allows the
> > > > user to fall back to the group path. Why do we need introduce another
> > > > ioctl to explicitly bind to container via the nongroup interface?  
> > >
> > > New userspace still needs a fallback path if it hits the 'try and
> > > fail'. Keeping the device FD open and just using a different ioctl to
> > > bind to a container/group FD, which new userspace can then obtain as a
> > > fallback, might be OK.
> > >
> > > Hard to see without going through the qemu parts, so maybe just keep
> > > it in mind  
> > 
> > If we assume that the container/group/device interface is essentially
> > deprecated once we have iommufd, it doesn't make a lot of sense to me
> > to tack on a container/device interface just so userspace can avoid
> > reverting to the fully legacy interface.
> > 
> > But why would we create vfio device interface files at all if they
> > can't work?  I'm not really on board with creating a try-and-fail
> > interface for a mechanism that cannot work for a given device.  The
> > existence of the device interface should indicate that it's supported.
> > Thanks,
> >   
> 
> Now it's a try-and-fail model even for devices which support iommufd.
> Per Jason's suggestion, a device is always opened with a parked fops
> which supports only bind. Binding serves as the contract for handling
> exclusive ownership of a device and for switching to the normal fops on
> success. So the user has to try-and-fail in case multiple threads attempt
> to open the same device. A device which doesn't support iommufd is no
> different, except the binding request always fails (due to the missing
> .bind_iommufd in the kernel driver).

That's a rather important difference.  I don't really see how that's
comparable to the mutually exclusive nature of the legacy vs device
interface.  We're not going to present a vfio device interface for SW
mdevs that can't participate in iommufd, right?  Thanks,

Alex



Re: [RFC 10/20] iommu/iommufd: Add IOMMU_DEVICE_GET_INFO

2021-09-22 Thread Alex Williamson
On Sun, 19 Sep 2021 14:38:38 +0800
Liu Yi L  wrote:

> +struct iommu_device_info {
> + __u32   argsz;
> + __u32   flags;
> +#define IOMMU_DEVICE_INFO_ENFORCE_SNOOP  (1 << 0) /* IOMMU enforced 
> snoop */

Is this too PCI specific, or perhaps too much of the mechanism rather
than the result?  I.e. should we just indicate whether the IOMMU
guarantees coherent DMA (see the sketch at the end of this message)?
Thanks,

Alex

> +#define IOMMU_DEVICE_INFO_PGSIZES(1 << 1) /* supported page sizes */
> +#define IOMMU_DEVICE_INFO_ADDR_WIDTH (1 << 2) /* addr_width field valid */
> + __u64   dev_cookie;
> + __u64   pgsize_bitmap;
> + __u32   addr_width;
> +};
> +
> +#define IOMMU_DEVICE_GET_INFO	_IO(IOMMU_TYPE, IOMMU_BASE + 1)
>  
>  #define IOMMU_FAULT_PERM_READ	(1 << 0) /* read */
>  #define IOMMU_FAULT_PERM_WRITE	(1 << 1) /* write */
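
For example, a result-oriented flag might look like this (a sketch;
the flag name is hypothetical, not part of the posted uAPI):

  #define IOMMU_DEVICE_INFO_DMA_COHERENT (1 << 0) /* DMA coherent w/o explicit cache sync */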



Re: [RFC 05/20] vfio/pci: Register device to /dev/vfio/devices

2021-09-22 Thread Alex Williamson
On Wed, 22 Sep 2021 01:19:08 +
"Tian, Kevin"  wrote:

> > From: Alex Williamson 
> > Sent: Wednesday, September 22, 2021 5:09 AM
> > 
> > On Tue, 21 Sep 2021 13:40:01 -0300
> > Jason Gunthorpe  wrote:
> >   
> > > On Sun, Sep 19, 2021 at 02:38:33PM +0800, Liu Yi L wrote:  
> > > > This patch exposes the device-centric interface for vfio-pci devices. To
> > > > be compatible with existing users, vfio-pci exposes both legacy group
> > > > interface and device-centric interface.
> > > >
> > > > As explained in last patch, this change doesn't apply to devices which
> > > > cannot be forced to snoop cache by their upstream iommu. Such devices
> > > > are still expected to be opened via the legacy group interface.  
> > 
> > This doesn't make much sense to me.  The previous patch indicates
> > there's work to be done in updating the kvm-vfio contract to understand
> > DMA coherency, so you're trying to limit use cases to those where the
> > IOMMU enforces coherency, but there's QEMU work to be done to support
> > the iommufd uAPI at all.  Isn't part of that work to understand how KVM
> > will be told about non-coherent devices rather than "meh, skip it in the
> > kernel"?  Also let's not forget that vfio is not only for KVM.  
> 
> The policy here is that VFIO will not expose such devices (no enforce-snoop)
> in the new device hierarchy at all. In this case QEMU will fall back to the
> group interface automatically and then rely on the existing contract to 
> connect 
> vfio and QEMU. It doesn't need to care about the whatever new contract
> until such devices are exposed in the new interface.
> 
> Yes, vfio is not only for KVM. But here it's more a task split based on
> staging considerations. IMO it's not necessary to further split the task
> into supporting non-snoop devices for userspace drivers and then for KVM.

Patch 10 introduces an iommufd interface for QEMU to learn whether the
IOMMU enforces DMA coherency; at that point QEMU could revert to the
legacy interface, or register the iommufd with KVM, or otherwise
establish non-coherent DMA with KVM as necessary.  We're adding cruft
to the kernel here to enforce an unnecessary limitation.

If there are reasons the kernel can't support the device interface,
that's a valid reason not to present the interface, but this seems like
picking a specific gap that userspace is already able to detect from
this series at the expense of other use cases.  Thanks,

Alex



Re: [RFC 08/20] vfio/pci: Add VFIO_DEVICE_BIND_IOMMUFD

2021-09-22 Thread Alex Williamson
On Tue, 21 Sep 2021 14:29:39 -0300
Jason Gunthorpe  wrote:

> On Sun, Sep 19, 2021 at 02:38:36PM +0800, Liu Yi L wrote:
> > +struct vfio_device_iommu_bind_data {
> > +   __u32   argsz;
> > +   __u32   flags;
> > +   __s32   iommu_fd;
> > +   __u64   dev_cookie;  
> 
> Missing explicit padding
> 
> Always use __aligned_u64 in uapi headers, fix all the patches.

We don't need padding or explicit alignment if we just swap the order
of iommu_fd and dev_cookie (sketch below).  Thanks,

Alex
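
A sketch of the reordering, using the fields from the quoted patch:

  struct vfio_device_iommu_bind_data {
          __u32   argsz;
          __u32   flags;
          __u64   dev_cookie;     /* now naturally 8-byte aligned */
          __s32   iommu_fd;
  };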



Re: [RFC 03/20] vfio: Add vfio_[un]register_device()

2021-09-22 Thread Alex Williamson
On Wed, 22 Sep 2021 09:22:52 -0300
Jason Gunthorpe  wrote:

> On Wed, Sep 22, 2021 at 09:23:34AM +, Tian, Kevin wrote:
> 
> > > Providing an ioctl to bind to a normal VFIO container or group might
> > > allow a reasonable fallback in userspace..  
> > 
> > I didn't get this point though. An error in binding already allows the
> > user to fall back to the group path. Why do we need introduce another
> > ioctl to explicitly bind to container via the nongroup interface?   
> 
> New userspace still needs a fallback path if it hits the 'try and
> fail'. Keeping the device FD open and just using a different ioctl to
> bind to a container/group FD, which new userspace can then obtain as a
> fallback, might be OK.
> 
> Hard to see without going through the qemu parts, so maybe just keep
> it in mind

If we assume that the container/group/device interface is essentially
deprecated once we have iommufd, it doesn't make a lot of sense to me
to tack on a container/device interface just so userspace can avoid
reverting to the fully legacy interface.

But why would we create vfio device interface files at all if they
can't work?  I'm not really on board with creating a try-and-fail
interface for a mechanism that cannot work for a given device.  The
existence of the device interface should indicate that it's supported.
Thanks,

Alex



Re: [RFC 05/20] vfio/pci: Register device to /dev/vfio/devices

2021-09-21 Thread Alex Williamson
On Tue, 21 Sep 2021 13:40:01 -0300
Jason Gunthorpe  wrote:

> On Sun, Sep 19, 2021 at 02:38:33PM +0800, Liu Yi L wrote:
> > This patch exposes the device-centric interface for vfio-pci devices. To
> > be compatible with existing users, vfio-pci exposes both legacy group
> > interface and device-centric interface.
> > 
> > As explained in last patch, this change doesn't apply to devices which
> > cannot be forced to snoop cache by their upstream iommu. Such devices
> > are still expected to be opened via the legacy group interface.

This doesn't make much sense to me.  The previous patch indicates
there's work to be done in updating the kvm-vfio contract to understand
DMA coherency, so you're trying to limit use cases to those where the
IOMMU enforces coherency, but there's QEMU work to be done to support
the iommufd uAPI at all.  Isn't part of that work to understand how KVM
will be told about non-coherent devices rather than "meh, skip it in the
kernel"?  Also let's not forget that vfio is not only for KVM.
 
> > When the device is opened via /dev/vfio/devices, vfio-pci should prevent
> > the user from accessing the assigned device because the device is still
> > attached to the default domain which may allow user-initiated DMAs to
> > touch arbitrary place. The user access must be blocked until the device
> > is later bound to an iommufd (see patch 08). The binding acts as the
> > contract for putting the device in a security context which ensures user-
> > initiated DMAs via this device cannot harm the rest of the system.
> > 
> > This patch introduces a vdev->block_access flag for this purpose. It's set
> > when the device is opened via /dev/vfio/devices and cleared after binding
> > to iommufd succeeds. mmap and r/w handlers check this flag to decide whether
> > user access should be blocked or not.  
> 
> This should not be in vfio_pci.
> 
> AFAIK there is no condition where a vfio driver can work without being
> connected to some kind of iommu back end, so the core code should
> handle this interlock globally. A vfio driver's ops should not be
> callable until the iommu is connected.
> 
> The only vfio_pci patch in this series should be adding a new callback
> op to take in an iommufd and register the pci_device as a iommufd
> device.

Couldn't the same argument be made that registering a $bus device as an
iommufd device is a common interface that shouldn't be the
responsibility of the vfio device driver?  Is userspace opening the
non-group device anything more than a reservation of that device if
access is withheld until iommu isolation?  I also don't really want to
predict how ioctls might evolve to guess whether only blocking .read,
.write, and .mmap callbacks are sufficient.  Thanks,

Alex



Re: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

2021-09-21 Thread Alex Williamson
On Sun, 19 Sep 2021 14:38:30 +0800
Liu Yi L  wrote:

> This patch introduces a new interface (/dev/vfio/devices/$DEVICE) for
> userspace to directly open a vfio device w/o relying on container/group
> (/dev/vfio/$GROUP). Anything related to group is now hidden behind
> iommufd (more specifically in iommu core by this RFC) in a device-centric
> manner.
> 
> In case a device is exposed in both legacy and new interfaces (see next
> patch for how to decide it), this patch also ensures that when the device
> is already opened via one interface then the other one must be blocked.
> 
> Signed-off-by: Liu Yi L 
> ---
>  drivers/vfio/vfio.c  | 228 +++
>  include/linux/vfio.h |   2 +
>  2 files changed, 213 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 02cc51ce6891..84436d7abedd 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
...
> @@ -2295,6 +2436,52 @@ static struct miscdevice vfio_dev = {
>   .mode = S_IRUGO | S_IWUGO,
>  };
>  
> +static char *vfio_device_devnode(struct device *dev, umode_t *mode)
> +{
> + return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
> +}

dev_name() doesn't provide us with any uniqueness guarantees, so this
could potentially generate naming conflicts.  The similar scheme for
devices within an iommu group appends an instance number if a conflict
occurs, but that solution doesn't work here where the name isn't just a
link to the actual device.  Devices within an iommu group are also
likely associated with a single bus_type, so the potential for conflict is
pretty negligible, but that's not the case as vfio is adopted for new
device types.  Thanks,

Alex



Re: [RFC][PATCH v2 00/13] iommu/arm-smmu-v3: Add NVIDIA implementation

2021-08-31 Thread Alex Williamson
On Mon, 30 Aug 2021 19:59:10 -0700
Nicolin Chen  wrote:

> The SMMUv3 devices implemented in the Grace SoC support NVIDIA's custom
> CMDQ-Virtualization (CMDQV) hardware. Like the new ECMDQ feature first
> introduced in the ARM SMMUv3.3 specification, CMDQV adds multiple VCMDQ
> interfaces to supplement the single architected SMMU_CMDQ in an effort
> to reduce contention.
> 
> This series of patches adds CMDQV support with its preparatory changes:
> 
> * PATCH-1 to PATCH-8 are related to shared VMID feature: they are used
>   first to improve TLB utilization, second to bind a shared VMID with a
>   VCMDQ interface for hardware configuring requirement.

The vfio changes would need to be implemented in alignment with the
/dev/iommu proposals[1].  AIUI, the VMID is essentially binding
multiple containers together for TLB invalidation.  I expect the
proposal below largely takes care of that already, in that a single
iommu-fd can support multiple I/O address spaces, and it's largely
expected that a hypervisor would use a single iommu-fd, so this explicit
connection by userspace across containers wouldn't be necessary.

We're expecting to talk more about the /dev/iommu approach at Plumbers
in a few weeks.  Thanks,

Alex

[1]https://lore.kernel.org/kvm/bn9pr11mb5433b1e4ae5b0480369f97178c...@bn9pr11mb5433.namprd11.prod.outlook.com/

 
> * PATCH-9 and PATCH-10 are to accommodate the NVIDIA implementation with
>   the existing arm-smmu-v3 driver.
> 
> * PATCH-11 borrows the "implementation infrastructure" from the arm-smmu
>   driver so later change can build upon it.
> 
> * PATCH-12 adds an initial NVIDIA implementation related to host feature,
>   and also adds implementation specific ->device_reset() and ->get_cmdq()
>   callback functions.
> 
> * PATCH-13 adds virtualization features using VFIO mdev interface, which
>   allows user space hypervisor to map and get access to one of the VCMDQ
>   interfaces of CMDQV module.
> 
> ( Thinking that reviewers can get a better view of this implementation,
>   I am attaching QEMU changes here for reference purpose:
>   https://github.com/nicolinc/qemu/commits/dev/cmdqv_v6.0.0-rc2
>   The branch has all preparational changes, while I'm still integrating
>   device model and ARM-VIRT changes, and will push them these two days,
>   although they might not be in a good shape of being sent to review yet )
> 
> Above all, I marked RFC for this series, as I feel that we may come up
> with some better solution. So please kindly share your reviews and insights.
> 
> Thank you!
> 
> Changelog
> v1->v2:
>  * Added mdev interface support for hypervisor and VMs.
>  * Added preparatory changes for mdev interface implementation.
>  * PATCH-12 Changed ->issue_cmdlist() to ->get_cmdq() for a better
>integration with recently merged ECMDQ-related changes.
> 
> Nate Watterson (3):
>   iommu/arm-smmu-v3: Add implementation infrastructure
>   iommu/arm-smmu-v3: Add support for NVIDIA CMDQ-Virtualization hw
>   iommu/nvidia-smmu-v3: Add mdev interface support
> 
> Nicolin Chen (10):
>   iommu: Add set_nesting_vmid/get_nesting_vmid functions
>   vfio: add VFIO_IOMMU_GET_VMID and VFIO_IOMMU_SET_VMID
>   vfio: Document VMID control for IOMMU Virtualization
>   vfio: add set_vmid and get_vmid for vfio_iommu_type1
>   vfio/type1: Implement set_vmid and get_vmid
>   vfio/type1: Set/get VMID to/from iommu driver
>   iommu/arm-smmu-v3: Add shared VMID support for NESTING
>   iommu/arm-smmu-v3: Add VMID alloc/free helpers
>   iommu/arm-smmu-v3: Pass dev pointer to arm_smmu_detach_dev
>   iommu/arm-smmu-v3: Pass cmdq pointer in arm_smmu_cmdq_issue_cmdlist()
> 
>  Documentation/driver-api/vfio.rst |   34 +
>  MAINTAINERS   |2 +
>  drivers/iommu/arm/arm-smmu-v3/Makefile|2 +-
>  .../iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c  |   15 +
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  121 +-
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |   18 +
>  .../iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c| 1249 +
>  drivers/iommu/iommu.c |   20 +
>  drivers/vfio/vfio.c   |   25 +
>  drivers/vfio/vfio_iommu_type1.c   |   37 +
>  include/linux/iommu.h |5 +
>  include/linux/vfio.h  |2 +
>  include/uapi/linux/vfio.h |   26 +
>  13 files changed, 1537 insertions(+), 19 deletions(-)
>  create mode 100644 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c
>  create mode 100644 drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
> 



Re: [RFC v2] /dev/iommu uAPI proposal

2021-07-13 Thread Alex Williamson
On Tue, 13 Jul 2021 09:55:03 -0300
Jason Gunthorpe  wrote:

> On Mon, Jul 12, 2021 at 11:56:24PM +, Tian, Kevin wrote:
> 
> > Maybe I misunderstood your question. Are you specifically worried
> > about establishing the security context for a mdev vs. for its
> > parent?  
> 
> The way to think about the cookie, and the device bind/attach in
> general, is as taking control of a portion of the IOMMU routing:
> 
>  - RID
>  - RID + PASID
>  - "software"
> 
> For the first two there can be only one device attachment per value so
> the cookie is unambiguous.
> 
> For "software" the iommu layer has little to do with this - everything
> is constructed outside by the mdev. If the mdev wishes to communicate
> on /dev/iommu using the cookie then it has to do so using some iommufd
> api and we can convey the proper device at that point.
> 
> Kevin didn't show it, but along side the PCI attaches:
> 
> struct iommu_attach_data * iommu_pci_device_attach(
> struct iommu_dev *dev, struct pci_device *pdev,
> u32 ioasid);
> 
> There would also be a software attach for mdev:
> 
> struct iommu_attach_data * iommu_sw_device_attach(
> struct iommu_dev *dev, struct device *pdev, u32 ioasid);
> 
> Which does not connect anything to the iommu layer.
> 
> It would have to return something that allows querying the IO page
> table, and the mdev would use that API instead of vfio_pin_pages().


Quoting this proposal again:

> 1)  A successful binding call for the first device in the group creates 
> the security context for the entire group, by:
> 
> * Verifying group viability in a similar way as VFIO does;
> 
> * Calling IOMMU-API to move the group into a block-dma state,
> >   which makes all devices in the group attached to a block-dma
>   domain with an empty I/O page table;
> 
> VFIO should not allow the user to mmap the MMIO bar of the bound
> device until the binding call succeeds.

The attach step is irrelevant to my question, the bind step is where
the device/group gets into a secure state for device access.

So for IGD we have two scenarios, direct assignment and software mdevs.

AIUI the operation of VFIO_DEVICE_BIND_IOMMU_FD looks like this:

iommu_ctx = iommu_ctx_fdget(iommu_fd);

mdev = mdev_from_dev(vdev->dev);
dev = mdev ? mdev_parent_dev(mdev) : vdev->dev;

iommu_dev = iommu_register_device(iommu_ctx, dev, cookie);

In either case, this last line is either registering the IGD itself
(ie. the struct device representing PCI device 0000:00:02.0) or the
parent of the GVT-g mdev (ie. the struct device representing PCI device
0000:00:02.0).  They're the same!  AIUI, the cookie is simply an
arbitrary user generated value which they'll use to refer to this
device via the iommu_fd uAPI.

So what magic is iommu_register_device() doing to infer my intentions
as to whether I'm asking for the IGD RID to be isolated or I'm only
creating a software context for an mdev?  Thanks,

Alex



Re: [RFC v2] /dev/iommu uAPI proposal

2021-07-12 Thread Alex Williamson
On Mon, 12 Jul 2021 01:22:11 +
"Tian, Kevin"  wrote:
> > From: Alex Williamson 
> > Sent: Saturday, July 10, 2021 5:51 AM
> > On Fri, 9 Jul 2021 07:48:44 +
> > "Tian, Kevin"  wrote:  
 
> > > For mdev the struct device should be the pointer to the parent device.  
> > 
> > I don't get how iommu_register_device() differentiates an mdev from a
> > pdev in this case.  
> 
> via device cookie.


Let me re-add this section for more context:

> 3. Sample structures and helper functions
> 
> 
> Three helper functions are provided to support VFIO_BIND_IOMMU_FD:
> 
>   struct iommu_ctx *iommu_ctx_fdget(int fd);
>   struct iommu_dev *iommu_register_device(struct iommu_ctx *ctx,
>   struct device *device, u64 cookie);
>   int iommu_unregister_device(struct iommu_dev *dev);
> 
> An iommu_ctx is created for each fd:
> 
>   struct iommu_ctx {
>   // a list of allocated IOASID data's
>   struct xarray   ioasid_xa;
> 
>   // a list of registered devices
>   struct xarray   dev_xa;
>   };
> 
> Later some group-tracking fields will be also introduced to support 
> multi-devices group.
> 
> Each registered device is represented by iommu_dev:
> 
>   struct iommu_dev {
>   struct iommu_ctx*ctx;
>   // always be the physical device
>   struct device   *device;
>   u64 cookie;
>   struct kref kref;
>   };
> 
> A successful binding establishes a security context for the bound
> device and returns struct iommu_dev pointer to the caller. After this
> point, the user is allowed to query device capabilities via IOMMU_
> DEVICE_GET_INFO.
> 
> For mdev the struct device should be the pointer to the parent device. 


So we'll have a VFIO_DEVICE_BIND_IOMMU_FD ioctl where the user provides
the iommu_fd and a cookie.  vfio will use iommu_ctx_fdget() to get an
iommu_ctx* for that iommu_fd, then we'll call iommu_register_device()
using that iommu_ctx* we got from the iommu_fd, the cookie provided by
the user, and for an mdev, the parent of the device the user owns
(the device_fd on which this ioctl is called)...

How does an arbitrary user provided cookie let you differentiate that
the request is actually for an mdev versus the parent device itself?

For instance, how can the IOMMU layer distinguish GVT-g (mdev) vs GVT-d
(direct assignment) when both use the same struct device* and cookie is
just a user provided value?  Still confused.  Thanks,

Alex



Re: [RFC v2] /dev/iommu uAPI proposal

2021-07-09 Thread Alex Williamson
Hi Kevin,

A couple first pass comments...

On Fri, 9 Jul 2021 07:48:44 +
"Tian, Kevin"  wrote:
> 2.2. /dev/vfio device uAPI
> ++
> 
> /*
>   * Bind a vfio_device to the specified IOMMU fd
>   *
>   * The user should provide a device cookie when calling this ioctl. The 
>   * cookie is later used in IOMMU fd for capability query, iotlb invalidation
>   * and I/O fault handling.
>   *
>   * User is not allowed to access the device before the binding operation
>   * is completed.
>   *
>   * Unbind is automatically conducted when device fd is closed.
>   *
>   * Input parameters:
>   *   - iommu_fd;
>   *   - cookie;
>   *
>   * Return: 0 on success, -errno on failure.
>   */
> #define VFIO_BIND_IOMMU_FD	_IO(VFIO_TYPE, VFIO_BASE + 22)

I believe this is an ioctl on the device fd, therefore it should be
named VFIO_DEVICE_BIND_IOMMU_FD.
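
Something like the following (sketch, keeping the quoted ioctl number):

  #define VFIO_DEVICE_BIND_IOMMU_FD	_IO(VFIO_TYPE, VFIO_BASE + 22)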

> 
> 
> /*
>   * Report vPASID info to userspace via VFIO_DEVICE_GET_INFO
>   *
>   * Add a new device capability. The presence indicates that the user
>   * is allowed to create multiple I/O address spaces on this device. The
>   * capability further includes following flags:
>   *
>   *   - PASID_DELEGATED, if clear every vPASID must be registered to 
>   * the kernel;
>   *   - PASID_CPU, if set vPASID is allowed to be carried in the CPU 
>   * instructions (e.g. ENQCMD);
>   *   - PASID_CPU_VIRT, if set require vPASID translation in the CPU; 
>   * 
>   * The user must check that all devices with PASID_CPU set have the 
>   * same setting on PASID_CPU_VIRT. If mismatching, it should enable 
>   * vPASID only in one category (all set, or all clear).
>   *
>   * When the user enables vPASID on the device with PASID_CPU_VIRT
>   * set, it must enable vPASID CPU translation via kvm fd before attempting
>   * to use ENQCMD to submit work items. The command portal is blocked 
>   * by the kernel until the CPU translation is enabled.
>   */
> #define VFIO_DEVICE_INFO_CAP_PASID5
> 
> 
> /*
>   * Attach a vfio device to the specified IOASID
>   *
>   * Multiple vfio devices can be attached to the same IOASID, and vice 
>   * versa. 
>   *
>   * User may optionally provide a "virtual PASID" to mark an I/O page 
>   * table on this vfio device, if PASID_DELEGATED is not set in device info. 
>   * Whether the virtual PASID is physically used or converted to another 
>   * kernel-allocated PASID is a policy in the kernel.
>   *
>   * Because one device is allowed to bind to multiple IOMMU fd's, the
>   * user should provide both iommu_fd and ioasid for this attach operation.
>   *
>   * Input parameter:
>   *   - iommu_fd;
>   *   - ioasid;
>   *   - flag;
>   *   - vpasid (if specified);
>   * 
>   * Return: 0 on success, -errno on failure.
>   */
> #define VFIO_ATTACH_IOASID	_IO(VFIO_TYPE, VFIO_BASE + 23)
> #define VFIO_DETACH_IOASID	_IO(VFIO_TYPE, VFIO_BASE + 24)

Likewise, VFIO_DEVICE_{ATTACH,DETACH}_IOASID

...
> 3. Sample structures and helper functions
> 
> 
> Three helper functions are provided to support VFIO_BIND_IOMMU_FD:
> 
>   struct iommu_ctx *iommu_ctx_fdget(int fd);
>   struct iommu_dev *iommu_register_device(struct iommu_ctx *ctx,
>   struct device *device, u64 cookie);
>   int iommu_unregister_device(struct iommu_dev *dev);
> 
> An iommu_ctx is created for each fd:
> 
>   struct iommu_ctx {
>   // a list of allocated IOASID data's
>   struct xarray   ioasid_xa;
> 
>   // a list of registered devices
>   struct xarray   dev_xa;
>   };
> 
> Later some group-tracking fields will be also introduced to support 
> multi-devices group.
> 
> Each registered device is represented by iommu_dev:
> 
>   struct iommu_dev {
>   struct iommu_ctx*ctx;
>   // always be the physical device
>   struct device   *device;
>   u64 cookie;
>   struct kref kref;
>   };
> 
> A successful binding establishes a security context for the bound
> device and returns struct iommu_dev pointer to the caller. After this
> point, the user is allowed to query device capabilities via IOMMU_
> DEVICE_GET_INFO.

If we have an initial singleton-group-only restriction, I assume that
iommu_register_device() would fail for any devices that are not in
a singleton group and vfio would only expose direct device files for
the devices in singleton groups.  The latter implementation could
change when multi-device group support is added so that userspace can
assume that if the vfio device file exists, this interface is available.
I think this is confirmed further below.

> For mdev the struct device should be the pointer to the parent device. 

I don't get how iommu_register_device() differentiates an mdev from a
pdev in this case.

...
> 4.3. IOASID nesting (software)
> 

Re: Plan for /dev/ioasid RFC v2

2021-06-28 Thread Alex Williamson
On Mon, 28 Jun 2021 19:48:18 -0300
Jason Gunthorpe  wrote:

> On Mon, Jun 28, 2021 at 04:31:45PM -0600, Alex Williamson wrote:
> 
> > I'd expect that /dev/iommu will be used by multiple subsystems.  All
> > will want to bind devices to address spaces, so shouldn't binding a
> > device to an iommufd be an ioctl on the iommufd, ie.
> > IOMMU_BIND_VFIO_DEVICE_FD.  Maybe we don't even need "VFIO" in there and
> > the iommufd code can figure it out internally.  
> 
> It wants to be the other way around because iommu_fd is the lower
> level subsystem. We don't/can't teach iommu_fd how to convert a fd
> number to a vfio/vdpa/etc/etc, we teach all the things building on
> iommu_fd how to change a fd number to an iommu - they already
> necessarily have an inter-module linkage.

These seem like peer subsystems, like vfio and kvm.  vfio shouldn't
have any hard dependencies on the iommufd module, especially so long as
we have the legacy type1 code.  Likewise iommufd shouldn't have any on
vfio.  As much as you dislike the symbol_get hack of the kvm-vfio
device, it would be reasonable for iommufd to reach for a vfio symbol
when an IOMMU_BIND_VFIO_DEVICE_FD ioctl is called.
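
For instance, a sketch of that approach; the vfio helper name here is
hypothetical, and the symbol_get()/symbol_put() pattern mirrors what
the kvm-vfio device does today:

  static struct vfio_device *iommufd_get_vfio_device(struct file *file)
  {
          struct vfio_device *(*fn)(struct file *file);
          struct vfio_device *vdev;

          /* resolve at runtime: no hard module dependency on vfio */
          fn = symbol_get(vfio_device_get_from_file);
          if (!fn)
                  return ERR_PTR(-EOPNOTSUPP);

          vdev = fn(file);        /* takes a reference on success */
          symbol_put(vfio_device_get_from_file);

          return vdev;
  }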

> There is a certain niceness to what you are saying but it is not so
> practical without doing something bigger..
> 
> > Ideally vfio would also at least be able to register a type1 IOMMU
> > backend through the existing uapi, backed by this iommu code, ie. we'd
> > create a new "iommufd" (but without the user visible fd),   
> 
> It would be amazing to be able to achieve this, at least for me there
> are too many details be able to tell what that would look like
> exactly. I suggested once that putting the container ioctl interface
> in the drivers/iommu code may allow for this without too much trouble..

If we can't achieve this, then type1 legacy code is going to need to
live through an extended deprecation period.  I'm hoping that between
type1 and a native interface we'll have two paths into iommufd to vet
the design.  Thanks,

Alex



Re: Plan for /dev/ioasid RFC v2

2021-06-28 Thread Alex Williamson
On Mon, 28 Jun 2021 01:09:18 +
"Tian, Kevin"  wrote:

> > From: Jason Gunthorpe 
> > Sent: Friday, June 25, 2021 10:36 PM
> > 
> > On Fri, Jun 25, 2021 at 10:27:18AM +, Tian, Kevin wrote:
> >   
> > > -   When receiving the binding call for the 1st device in a group, 
> > > iommu_fd
> > > calls iommu_group_set_block_dma(group, dev->driver) which does
> > > several things:  
> > 
> > The whole problem here is trying to match this new world where we want
> > devices to be in charge of their own IOMMU configuration and the old
> > world where groups are in charge.
> > 
> > Inserting the group fd and then calling a device-centric
> > VFIO_GROUP_GET_DEVICE_FD_NEW doesn't solve this conflict, and isn't
> > necessary.   
> 
> No, this was not what I meant. There is no group fd required when
> calling this device-centric interface. I was actually talking about:
> 
>   iommu_group_set_block_dma(dev->group, dev->driver)
> 
> just because current iommu layer API is group-centric. Whether this
> should be improved could be next-level thing. Sorry for not making
> it clear in the first place.
> 
> > We can always get the group back from the device at any
> > point in the sequence do to a group wide operation.  
> 
> yes.
> 
> > 
> > What I saw as the appeal of the sort of idea was to just completely
> > leave all the difficult multi-device-group scenarios behind on the old
> > group centric API and then we don't have to deal with them at all, or
> > least not right away.  
> 
> yes, this is the staged approach that we discussed earlier. and
> the reason why I refined this proposal about multi-devices group 
> here is because you want to see some confidence along this
> direction. Thus I expanded your idea and hope to achieve consensus
> with Alex/Joerg who obviously have not been convinced yet.
> 
> > 
> > I'd see some progression where iommu_fd only works with 1:1 groups at
> > the start. Other scenarios continue with the old API.  
> 
> One uAPI open after completing this new sketch. v1 proposed to
> conduct binding (VFIO_BIND_IOMMU_FD) after device_fd is acquired.
> With this sketch we need a new VFIO_GROUP_GET_DEVICE_FD_NEW
> to complete both in one step. I want to get Alex's confirmation whether
> it sounds good to him, since it's better to unify the uAPI between 1:1 
> group and 1:N group even if we don't support 1:N in the start. 

I don't like it.  It doesn't make sense to me.  You have the
group-centric world, which must continue to exist and cannot change
because we cannot break the vfio uapi.  We can make extensions, we can
define a new parallel uapi, we can deprecate the uapi, but in the short
term, it can't change.

AIUI, the new device-centric model starts with vfio device files that
can be opened directly.  So what then is the purpose of a *GROUP* get
device fd?  Why is a vfio uapi involved in setting a device cookie for
another subsystem?

I'd expect that /dev/iommu will be used by multiple subsystems.  All
will want to bind devices to address spaces, so shouldn't binding a
device to an iommufd be an ioctl on the iommufd, ie.
IOMMU_BIND_VFIO_DEVICE_FD.  Maybe we don't even need "VFIO" in there and
the iommufd code can figure it out internally.

You're essentially trying to reduce vfio to the device interface.  That
necessarily implies that ioctls on the container, group, or passed
through the container to the iommu no longer exist.  From my
perspective, there should ideally be no new vfio ioctls.  The user gets
a limited access vfio device fd and enables full access to the device
by registering it to the iommufd subsystem (100% this needs to be
enforced until close() to avoid revoke issues).  The user interacts
exclusively with vfio via the device fd and performs all DMA address
space related operations through the iommufd.
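
A rough userspace flow under that model might be (a sketch only; the
paths, ioctl, and struct names are placeholders, not an existing uAPI):

  int devfd = open("/dev/vfio/devices/vfio0", O_RDWR); /* limited access */
  int iommufd = open("/dev/iommu", O_RDWR);

  struct iommu_bind_vfio_device bind = {
          .argsz      = sizeof(bind),
          .dev_fd     = devfd,
          .dev_cookie = 0x1,
  };

  /* full device access is enabled only by registration, and the
   * registration must hold until close() to avoid revoke issues */
  ioctl(iommufd, IOMMU_BIND_VFIO_DEVICE_FD, &bind);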

> > Then maybe groups where all devices use the same IOASID.
> > 
> > Then 1:N groups if the source device is reliably identifiable, this
> > requires iommu subystem work to attach domains to sub-group objects -
> > not sure it is worthwhile.
> > 
> > But at least we can talk about each step with well thought out patches
> > 
> > The only thing that needs to be done to get the 1:1 step is to broadly
> > define how the other two cases will work so we don't get into trouble
> > and set some way to exclude the problematic cases from even getting to
> > iommu_fd in the first place.
> > 
> > For instance if we go ahead and create /dev/vfio/device nodes we could
> > do this only if the group was 1:1, otherwise the group cdev has to be
> > used, along with its API.  
> 
> I feel for VFIO possibly we don't need significant change to its uAPI 
> sequence, since it anyway needs to support existing semantics for 
> backward compatibility. With this sketch we can keep vfio container/
> group by introducing an external iommu type which implies a different
> GET_DEVICE_FD semantics. /dev/iommu can report a fd-wide capability
> for whether 1:N group is supported to vfio user.


Re: Plan for /dev/ioasid RFC v2

2021-06-18 Thread Alex Williamson
On Fri, 18 Jun 2021 08:37:35 -0700
"Raj, Ashok"  wrote:

> On Fri, Jun 18, 2021 at 12:15:06PM -0300, Jason Gunthorpe wrote:
> > On Fri, Jun 18, 2021 at 03:47:51PM +0200, Joerg Roedel wrote:  
> > > Hi Kevin,
> > > 
> > > On Thu, Jun 17, 2021 at 07:31:03AM +, Tian, Kevin wrote:  
> > > > Now let's talk about the new IOMMU behavior:
> > > > 
> > > > -   A device is blocked from doing DMA to any resource outside of
> > > > its group when it's probed by the IOMMU driver. This could be a
> > > > special state w/o attaching to any domain, or a new special domain
> > > > type which differentiates it from existing domain types (identity, 
> > > > dma, or unmanged). Actually existing code already includes a
> > > > IOMMU_DOMAIN_BLOCKED type but nobody uses it.  
> > > 
> > > There is a reason for the default domain to exist: Devices which require
> > > RMRR mappings to be present. You can't just block all DMA from devices
> > > until a driver takes over, we put much effort into making sure there is
> > > not even a small window in time where RMRR regions (unity mapped regions
> > > on AMD) are not mapped.  
> > 
> > Yes, I think the DMA blocking can only start around/after a VFIO type
> > driver has probed() and bound to a device in the group, not much
> > different from today.  
> 
> Does this mean when a device has a required "RMRR" that requires a unity
> mapping we block assigning those devices to guests? I remember we had some
> restriction but there was a need to go around it at some point in time.
> 
> - Either we disallow assigning devices with RMRR
> - Break that unity map when the device is probed and after which any RMRR
>   access from device will fault.

We currently disallow assignment of RMRR encumbered devices except for
the known cases of USB and IGD.  In the general case, an RMRR imposes
a requirement on the host system to maintain ranges of identity mapping
that is incompatible with userspace ownership of the device and IOVA
address space.  AFAICT, nothing changes in the /dev/iommu model that
would make it safe to entrust userspace with RMRR encumbered devices.
Thanks,

Alex



Re: Plan for /dev/ioasid RFC v2

2021-06-17 Thread Alex Williamson
On Thu, 17 Jun 2021 07:31:03 +
"Tian, Kevin"  wrote:

> > From: Alex Williamson 
> > Sent: Thursday, June 17, 2021 3:40 AM
> > 
> > On Wed, 16 Jun 2021 06:43:23 +
> > "Tian, Kevin"  wrote:
> >   
> > > > From: Alex Williamson 
> > > > Sent: Wednesday, June 16, 2021 12:12 AM
> > > >
> > > > On Tue, 15 Jun 2021 02:31:39 +
> > > > "Tian, Kevin"  wrote:
> > > >  
> > > > > > From: Alex Williamson 
> > > > > > Sent: Tuesday, June 15, 2021 12:28 AM
> > > > > >  
> > > > > [...]  
> > > > > > > IOASID. Today the group fd requires an IOASID before it hands out 
> > > > > > > a
> > > > > > > device_fd. With iommu_fd the device_fd will not allow IOCTLs 
> > > > > > > until  
> > it  
> > > > > > > has a blocked DMA IOASID and is successefully joined to an  
> > iommu_fd.  
> > > > > >
> > > > > > Which is the root of my concern.  Who owns ioctls to the device fd?
> > > > > > It's my understanding this is a vfio provided file descriptor and 
> > > > > > it's
> > > > > > therefore vfio's responsibility.  A device-level IOASID interface
> > > > > > therefore requires that vfio manage the group aspect of device 
> > > > > > access.
> > > > > > AFAICT, that means that device access can therefore only begin when 
> > > > > >  
> > all  
> > > > > > devices for a given group are attached to the IOASID and must halt 
> > > > > > for
> > > > > > all devices in the group if any device is ever detached from an 
> > > > > > IOASID,
> > > > > > even temporarily.  That suggests a lot more oversight of the 
> > > > > > IOASIDs  
> > by  
> > > > > > vfio than I'd prefer.
> > > > > >  
> > > > >
> > > > > This is possibly the point that is worthy of more clarification and
> > > > > alignment, as it sounds like the root of controversy here.
> > > > >
> > > > > I feel the goal of vfio group management is more about ownership, i.e.
> > > > > all devices within a group must be assigned to a single user. 
> > > > > Following
> > > > > the three rules defined by Jason, what we really care is whether a 
> > > > > group
> > > > > of devices can be isolated from the rest of the world, i.e. no access 
> > > > > to
> > > > > memory/device outside of its security context and no access to its
> > > > > security context from devices outside of this group. This can be  
> > achieved  
> > > > > as long as every device in the group is either in block-DMA state when
> > > > > it's not attached to any security context or attached to an IOASID  
> > context  
> > > > > in IOMMU fd.
> > > > >
> > > > > As long as group-level isolation is satisfied, how devices within a 
> > > > > group
> > > > > are further managed is decided by the user (unattached, all attached 
> > > > > to
> > > > > same IOASID, attached to different IOASIDs) as long as the user
> > > > > understands the implication of lacking of isolation within the group. 
> > > > >  
> > This  
> > > > > is what a device-centric model comes to play. Misconfiguration just  
> > hurts  
> > > > > the user itself.
> > > > >
> > > > > If this rationale can be agreed, then I didn't see the point of 
> > > > > having VFIO
> > > > > to mandate all devices in the group must be attached/detached in
> > > > > lockstep.  
> > > >
> > > > In theory this sounds great, but there are still too many assumptions
> > > > and too much hand waving about where isolation occurs for me to feel
> > > > like I really have the complete picture.  So let's walk through some
> > > > examples.  Please fill in and correct where I'm wrong.  
> > >
> > > Thanks for putting these examples. They are helpful for clearing the
> > > whole picture.
> > >
> > > Before filling in let's first align on what is the key difference between
> > > current VFIO

Re: Plan for /dev/ioasid RFC v2

2021-06-16 Thread Alex Williamson
On Wed, 16 Jun 2021 06:43:23 +
"Tian, Kevin"  wrote:

> > From: Alex Williamson 
> > Sent: Wednesday, June 16, 2021 12:12 AM
> > 
> > On Tue, 15 Jun 2021 02:31:39 +
> > "Tian, Kevin"  wrote:
> >   
> > > > From: Alex Williamson 
> > > > Sent: Tuesday, June 15, 2021 12:28 AM
> > > >  
> > > [...]  
> > > > > IOASID. Today the group fd requires an IOASID before it hands out a
> > > > > device_fd. With iommu_fd the device_fd will not allow IOCTLs until it
> > > > > has a blocked DMA IOASID and is successefully joined to an iommu_fd.  
> > > >
> > > > Which is the root of my concern.  Who owns ioctls to the device fd?
> > > > It's my understanding this is a vfio provided file descriptor and it's
> > > > therefore vfio's responsibility.  A device-level IOASID interface
> > > > therefore requires that vfio manage the group aspect of device access.
> > > > AFAICT, that means that device access can therefore only begin when all
> > > > devices for a given group are attached to the IOASID and must halt for
> > > > all devices in the group if any device is ever detached from an IOASID,
> > > > even temporarily.  That suggests a lot more oversight of the IOASIDs by
> > > > vfio than I'd prefer.
> > > >  
> > >
> > > This is possibly the point that is worthy of more clarification and
> > > alignment, as it sounds like the root of controversy here.
> > >
> > > I feel the goal of vfio group management is more about ownership, i.e.
> > > all devices within a group must be assigned to a single user. Following
> > > the three rules defined by Jason, what we really care is whether a group
> > > of devices can be isolated from the rest of the world, i.e. no access to
> > > memory/device outside of its security context and no access to its
> > > security context from devices outside of this group. This can be achieved
> > > as long as every device in the group is either in block-DMA state when
> > > it's not attached to any security context or attached to an IOASID context
> > > in IOMMU fd.
> > >
> > > As long as group-level isolation is satisfied, how devices within a group
> > > are further managed is decided by the user (unattached, all attached to
> > > same IOASID, attached to different IOASIDs) as long as the user
> > > understands the implication of lacking of isolation within the group. This
> > > is what a device-centric model comes to play. Misconfiguration just hurts
> > > the user itself.
> > >
> > > If this rationale can be agreed, then I didn't see the point of having 
> > > VFIO
> > > to mandate all devices in the group must be attached/detached in
> > > lockstep.  
> > 
> > In theory this sounds great, but there are still too many assumptions
> > and too much hand waving about where isolation occurs for me to feel
> > like I really have the complete picture.  So let's walk through some
> > examples.  Please fill in and correct where I'm wrong.  
> 
> Thanks for putting these examples. They are helpful for clearing the 
> whole picture.
> 
> Before filling in let's first align on what is the key difference between
> current VFIO model and this new proposal. With this comparison we'll
> know which of following questions are answered with existing VFIO
> mechanism and which are handled differently.
> 
> With Yi's help we figured out the current mechanism:
> 
> 1) vfio_group_viable. The code comment explains the intention clearly:
> 
> --
> * A vfio group is viable for use by userspace if all devices are in
>  * one of the following states:
>  *  - driver-less
>  *  - bound to a vfio driver
>  *  - bound to an otherwise allowed driver
>  *  - a PCI interconnect device
> --
> 
> Note this check is not related to an IOMMU security context.

Because this is a pre-requisite for imposing that IOMMU security
context.
 
> 2) vfio_iommu_group_notifier. When an IOMMU_GROUP_NOTIFY_
> BOUND_DRIVER event is notified, vfio_group_viable is re-evaluated.
> If the affected group was previously viable but now becomes not 
> viable, BUG_ON() as it implies that this device is bound to a non-vfio 
> driver which breaks the group isolation.

This notifier action is conditional on there being users of devices
within a secure group IOMMU context.

> 3) vfio_group_get_device_fd. User can acquire a device fd only after
>   a) the group is viable;
> 

Re: Plan for /dev/ioasid RFC v2

2021-06-15 Thread Alex Williamson
On Tue, 15 Jun 2021 01:21:35 +
"Tian, Kevin"  wrote:

> > From: Jason Gunthorpe 
> > Sent: Monday, June 14, 2021 9:38 PM
> > 
> > On Mon, Jun 14, 2021 at 03:09:31AM +, Tian, Kevin wrote:
> >   
> > > If a device can be always blocked from accessing memory in the IOMMU
> > > before it's bound to a driver or more specifically before the driver
> > > moves it to a new security context, then there is no need for VFIO
> > > to track whether IOASIDfd has taken over ownership of the DMA
> > > context for all devices within a group.  
> > 
> > I've been assuming we'd do something like this, where when a device is
> > first turned into a VFIO it tells the IOMMU layer that this device
> > should be DMA blocked unless an IOASID is attached to
> > it. Disconnecting an IOASID returns it to blocked.  
> 
> Or just make sure a device is in block-DMA when it's unbound from a
> driver or a security context. Then no need to explicitly tell IOMMU layer 
> to do so when it's bound to a new driver.
> 
> Currently the default domain type applies even when a device is not
> bound. This implies that if iommu=passthrough a device is always 
> allowed to access arbitrary system memory with or without a driver.
> I feel the current domain type (identity, dma, unmanaged) should apply
> only when a driver is loaded...

Note that vfio does not currently require all devices in the group to
be bound to drivers.  Other devices within the group, those bound to
vfio drivers, can be used in this configuration.  This is not
necessarily recommended though as a non-vfio, non-stub driver binding
to one of those devices can trigger a BUG_ON.
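
The block-DMA state being discussed can be approximated with the existing
IOMMU API by parking the group on an empty, unmanaged domain; a hedged
sketch (later kernels grew an explicit IOMMU_DOMAIN_BLOCKED type for
exactly this):

#include <linux/iommu.h>

/* Park a group on a domain with no mappings, so all DMA is blocked. */
static struct iommu_domain *block_group_dma(struct iommu_group *grp,
                                            struct bus_type *bus)
{
        struct iommu_domain *empty = iommu_domain_alloc(bus);

        if (!empty)
                return ERR_PTR(-ENOMEM);

        /* No mappings are ever installed in 'empty'. */
        if (iommu_attach_group(empty, grp)) {
                iommu_domain_free(empty);
                return ERR_PTR(-EBUSY);
        }
        return empty;
}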

> > > If this works, I don't see the need for vfio to keep the sequence.
> > > VFIO still keeps group fd to claim ownership of all devices in a
> > > group.  
> > 
> > As Alex says you still have to deal with the problem that device A in
> > a group can gain control of device B in the same group.  
> 
> If there is no isolation in the group, then how could vfio prevent device
> A from gaining control of device B? For example, when both are attached
> to the same GPA address space with the device MMIO BARs included, devA
> can do p2p to devB. It's entirely the user's policy how to deal with
> devices within the group. 

The latter is user policy, yes, but it's a system security issue that
the user cannot use device A to control device B if the user doesn't
have access to both devices, ie. doesn't own the group.  vfio would
prevent this by not allowing access to device A while device B is
insecure and would require that all devices within the group remain in
a secure, user owned state for the extent of access to device A.

> > This means device A and B cannot be used in two different
> > security contexts.  
> 
> It depends on how the security context is defined. From the iommu layer's
> p.o.v., an IOASID is a security context which isolates a device from
> the rest of the system (but not from its siblings in the same group). As
> you suggested earlier, it's completely sane if a user wants to attach
> devices in a group to different IOASIDs. Here I'm just stating this fact.

This is sane, yes, but that doesn't give us license to allow the user
to access device A regardless of the state of device B.

> > 
> > If the /dev/iommu FD is the security context then the tracking is
> > needed there.
> >   
> 
> As I replied to Alex, my point is that VFIO doesn't need to know the
> attachment status of each device in a group before it can allow the user
> to access a device. As long as each device in a group is either in
> block-DMA or switched to a new address space created via the /dev/iommu
> FD, there's no problem allowing the user to access it. The user cannot
> do harm to the world outside of the group. The user knows there is no
> isolation within the group. That is it.

This is self-contradictory: "vfio doesn't need to know the attachment
status"... "[a]s long as each device in a group is either in block-DMA or
switched to a new address space".  So vfio does need to know the latter.
How does it know that?  Thanks,

Alex



Re: Plan for /dev/ioasid RFC v2

2021-06-15 Thread Alex Williamson
On Tue, 15 Jun 2021 02:31:39 +
"Tian, Kevin"  wrote:

> > From: Alex Williamson 
> > Sent: Tuesday, June 15, 2021 12:28 AM
> >   
> [...]
> > > IOASID. Today the group fd requires an IOASID before it hands out a
> > > device_fd. With iommu_fd the device_fd will not allow IOCTLs until it
> > > has a blocked DMA IOASID and is successfully joined to an iommu_fd.  
> > 
> > Which is the root of my concern.  Who owns ioctls to the device fd?
> > It's my understanding this is a vfio provided file descriptor and it's
> > therefore vfio's responsibility.  A device-level IOASID interface
> > therefore requires that vfio manage the group aspect of device access.
> > AFAICT, that means that device access can therefore only begin when all
> > devices for a given group are attached to the IOASID and must halt for
> > all devices in the group if any device is ever detached from an IOASID,
> > even temporarily.  That suggests a lot more oversight of the IOASIDs by
> > vfio than I'd prefer.
> >   
> 
> This is possibly the point that is worthy of more clarification and
> alignment, as it sounds like the root of controversy here.
> 
> I feel the goal of vfio group management is more about ownership, i.e. 
> all devices within a group must be assigned to a single user. Following
> the three rules defined by Jason, what we really care about is whether a
> group of devices can be isolated from the rest of the world, i.e. no
> access to memory/devices outside of its security context and no access
> to its security context from devices outside of this group. This can be
> achieved as long as every device in the group is either in a block-DMA
> state when it's not attached to any security context or attached to an
> IOASID context in the IOMMU fd.
> 
> As long as group-level isolation is satisfied, how devices within a group 
> are further managed is decided by the user (unattached, all attached to 
> the same IOASID, attached to different IOASIDs) as long as the user 
> understands the implication of the lack of isolation within the group.
> This is where a device-centric model comes into play. Misconfiguration
> just hurts the user itself.
> 
> If this rationale can be agreed upon, then I don't see the point of
> having VFIO mandate that all devices in the group be attached/detached
> in lockstep. 

In theory this sounds great, but there are still too many assumptions
and too much hand waving about where isolation occurs for me to feel
like I really have the complete picture.  So let's walk through some
examples.  Please fill in and correct where I'm wrong.

1) A dual-function PCIe e1000e NIC where the functions are grouped
   together due to ACS isolation issues.

   a) Initial state: functions 0 & 1 are both bound to the e1000e
      driver.

   b) Admin uses driverctl to bind function 1 to vfio-pci, creating a
      vfio device file, which is chmod'd to grant access to a user.

   c) User opens the vfio function 1 device file and an iommu_fd, and
      binds the device_fd to the iommu_fd.

   Does this succeed?
     - if no, specifically where does it fail?
     - if yes, vfio can now allow access to the device?

   d) Repeat b) for function 0.

   e) Repeat c), still using function 1, is it different?  Where?  Why?

2) The same NIC as 1)

   a) Initial state: functions 0 & 1 bound to vfio-pci, vfio device
      files granted to the user, user has bound both device_fds to the
      same iommu_fd.

   AIUI, even though not bound to an IOASID, vfio can now enable access
   through the device_fds, right?  What specific entity has placed these
   devices into a block-DMA state, when, and how?

   b) Both devices are attached to the same IOASID.

   Are we assuming that each device was atomically moved to the new
   IOMMU context by the IOASID code?  What if the IOMMU cannot change
   the domain atomically?

   c) The device_fd for function 1 is detached from the IOASID.

   Are we assuming the reverse of b) performed by the IOASID code?

   d) The device_fd for function 1 is unbound from the iommu_fd.

   Does this succeed?
     - if yes, what is the resulting IOMMU context of the device and
       who owns it?
     - if no, well, that results in numerous tear-down issues.

   e) Function 1 is unbound from vfio-pci.

   Does this work or is it blocked?  If blocked, by what entity
   specifically?

   f) Function 1 is bound to the e1000e driver.

   We clearly have a violation here; specifically, where and by whom in
   this path should we have been prevented from getting here, or who
   pushes the BUG_ON to abort this?

3) A dual-function conventional PCI e1000 NIC where the functions are
   grouped together due to shared RID.

   a) Repeat 2.a) and 2.b) such that we have valid, user-accessible
      devices in the same IOMMU context.

  

Re: Plan for /dev/ioasid RFC v2

2021-06-14 Thread Alex Williamson
On Mon, 14 Jun 2021 11:07:11 -0300
Jason Gunthorpe  wrote:

> On Sat, Jun 12, 2021 at 10:57:11AM -0600, Alex Williamson wrote:
> > On Fri, 11 Jun 2021 22:28:46 -0300
> > Jason Gunthorpe  wrote:
> >   
> > > On Fri, Jun 11, 2021 at 01:38:28PM -0600, Alex Williamson wrote:
> > >   
> > > > That's fine for a serial port, but not a device that can do DMA.
> > > > The entire point of vfio is to try to provide secure, DMA capable
> > > > userspace drivers.  If we relax enforcement of that isolation we've
> > > > failed.
> > > 
> > > I don't understand why the IOASID matters at all in this. Can you
> > > explain? What is the breach of isolation?  
> > 
> > I think we're arguing past each other again.  VFIO does not care one
> > iota how userspace configures IOASID domains for devices.  OTOH, VFIO
> > must be absolutely obsessed that the devices we're providing userspace
> > access to are isolated and continue to be isolated for the extent of
> > that access.  Given that we define that a group is the smallest set of
> > devices that can be isolated, that means that for a device to be
> > isolated, the group needs to be isolated.
> > 
> > VFIO currently has a contract with the IOMMU backend that a group is
> > attached to an IOMMU context (container) and from that point forward,
> > all devices within that group are known to be isolated.  
> 
> Sure - and maybe this is the source of the confusion as I've been
> assuming we'd change the kernel to match what we are doing. As in the
> other note, a device under VFIO control should immediately have its
> IOMMU programmed to block all DMA. This is basically attaching it to a
> dummy ioasid with an empty page table.
> 
> So before VFIO exposes any char device all devices/groups under VFIO
> control cannot do any DMA. The only security/isolation harmful action
> they can do is DMA to devices in the same group.
> 
> > I'm trying to figure out how a device based interface to the IOASID can
> > provide that same contract or whether VFIO needs to be able to monitor
> > the IOASID attachments of the devices in a group to control whether
> > device access is secure.  
> 
> Can you define what specifically secure, and isolation means?
> 
> To my mind it is these three things:
> 
>  1. The device can only do DMA to memory put into its security context

System memory or device memory, yes.

Corollary: The IOMMU group defines the minimum set of devices where the
IOMMU can control inter-device DMA.

>  2. No other security context can control this device
>  3. No other security context can do DMA to my userspace memory

Rule #1 is essentially the golden rule, the rest falls out from it.

> Today in VFIO the security context is the group fd. I would like the
> security context to be the iommu fd.

The vfio group is simply a representation of the IOMMU group, which is
the minimum isolation granularity.  The group is therefore the minimum
security context, but itself is not a security context.  The overall
security context for vfio is the set of containers (IOMMU contexts)
owned by a user, where each container defines the IOMMU context for a
set of groups.  The user can map process and device memory between
containers within the same security context.

As you know, we have various issues with invalidation of device
mappings between containers, so simplifying the security context to the
ioasidfd seems like a good plan.  The vfio notion of a container is
already encompassed in the IOASID of the ioasidfd.

The significant difference is therefore the device level IOASID versus
vfio's group level container granularity.  This means the IOASID model
needs to incorporate the group model not only in terms of isolation,
but also address-ability.  The vfio model allows these to be combined
as a significant simplification.

> 1 is achieved by ensuring the device is always connected to an

s/device/group/

As you note in reply to Kevin, in a multi-device group rule #1 can be
violated if only one device is connected to an IOASID.

> IOASID. Today the group fd requires an IOASID before it hands out a
> device_fd. With iommu_fd the device_fd will not allow IOCTLs until it
> has a blocked DMA IOASID and is successfully joined to an iommu_fd.

Which is the root of my concern.  Who owns ioctls to the device fd?
It's my understanding this is a vfio provided file descriptor and it's
therefore vfio's responsibility.  A device-level IOASID interface
therefore requires that vfio manage the group aspect of device access.
AFAICT, that means that device access can therefore only begin when all
devices for a given group are attached to the IOASID and must halt

Re: Plan for /dev/ioasid RFC v2

2021-06-13 Thread Alex Williamson
On Mon, 14 Jun 2021 03:09:31 +
"Tian, Kevin"  wrote:

> > From: Alex Williamson 
> > Sent: Saturday, June 12, 2021 5:39 AM
> > 
> > On Fri, 11 Jun 2021 00:58:35 +
> > "Tian, Kevin"  wrote:
> >   
> > > Hi, Alex,
> > >  
> > > > From: Alex Williamson 
> > > > Sent: Thursday, June 10, 2021 11:39 PM
> > > >
> > > > On Wed, 9 Jun 2021 15:49:40 -0300
> > > > Jason Gunthorpe  wrote:
> > > >  
> > > > > On Wed, Jun 09, 2021 at 10:27:22AM -0600, Alex Williamson wrote:
> > > > >  
> > > > > > > > It is a kernel decision, because a fundamental task of the
> > > > > > > > kernel is to ensure isolation between user-space tasks as well
> > > > > > > > as it can. And if a device assigned to one task can interfere
> > > > > > > > with a device of another task (e.g. by sending P2P messages),
> > > > > > > > then the promise of isolation is broken.  
> > > > > > >
> > > > > > > AIUI, the IOASID model will still enforce IOMMU groups, but it's
> > > > > > > not an explicit part of the interface like it is for vfio.  For
> > > > > > > example the IOASID model allows attaching individual devices such
> > > > > > > that we have granularity to create per device IOASIDs, but all
> > > > > > > devices within an IOMMU group are required to be attached to an
> > > > > > > IOASID before they can be used.  
> > > > >
> > > > > Yes, thanks Alex
> > > > >  
> > > > > > > It's not entirely clear to me yet how that last bit gets
> > > > > > > implemented though, ie. what barrier is in place to prevent device
> > > > > > > usage prior to reaching this viable state.  
> > > > >
> > > > > The major security checkpoint for the group is on the VFIO side.  We
> > > > > must require the group before userspace can be allowed access to any
> > > > > device registers. Obtaining the device_fd from the group_fd does this
> > > > > today as the group_fd is the security proof.
> > > > >
> > > > > Actually, thinking about this some more.. If the only way to get a
> > > > > working device_fd in the first place is to get it from the group_fd
> > > > > and thus pass a group-based security check, why do we need to do
> > > > > anything at the ioasid level?
> > > > >
> > > > > The security concept of isolation was satisfied as soon as userspace
> > > > > opened the group_fd. What do more checks in the kernel accomplish?  
> > > >
> > > > Opening the group is not the extent of the security check currently
> > > > required, the group must be added to a container and an IOMMU model
> > > > configured for the container *before* the user can get a devicefd.
> > > > Each devicefd creates a reference to this security context, therefore
> > > > access to a device does not exist without such a context.  
> > >
> > > IIUC each device has a default domain when it's probed by the iommu
> > > driver at boot time. This domain includes an empty page table, implying
> > > that the device is already in a security context before it's probed by
> > > a device driver.  
> > 
> > The default domain could be passthrough though, right?  
> 
> Good point.
> 
> >   
> > > Now when this device is added to vfio, vfio creates another security
> > > context through the above sequence. This sequence requires the device
> > > to switch from the default security context to this new one, before it
> > > can be accessed by the user.  
> > 
> > This is true currently, we use group semantics with the type1 IOMMU
> > backend to attach all devices in the group to a secure context,
> > regardless of the default domain.
> >   
> > > Then I wonder whether it's really necessary. As long as a device is in
> > > a security context at any time, access to a device can be allowed. The

Re: Plan for /dev/ioasid RFC v2

2021-06-12 Thread Alex Williamson
On Fri, 11 Jun 2021 22:28:46 -0300
Jason Gunthorpe  wrote:

> On Fri, Jun 11, 2021 at 01:38:28PM -0600, Alex Williamson wrote:
> 
> > That's fine for a serial port, but not a device that can do DMA.
> > The entire point of vfio is to try to provide secure, DMA capable
> > userspace drivers.  If we relax enforcement of that isolation we've
> > failed.  
> 
> I don't understand why the IOASID matters at all in this. Can you
> explain? What is the breach of isolation?

I think we're arguing past each other again.  VFIO does not care one
iota how userspace configures IOASID domains for devices.  OTOH, VFIO
must be absolutely obsessed that the devices we're providing userspace
access to are isolated and continue to be isolated for the extent of
that access.  Given that we define that a group is the smallest set of
devices that can be isolated, that means that for a device to be
isolated, the group needs to be isolated.

VFIO currently has a contract with the IOMMU backend that a group is
attached to an IOMMU context (container) and from that point forward,
all devices within that group are known to be isolated.

I'm trying to figure out how a device based interface to the IOASID can
provide that same contract or whether VFIO needs to be able to monitor
the IOASID attachments of the devices in a group to control whether
device access is secure.

As I outlined to Kevin, I think it makes a lot of sense to maintain a
group interface to the IOASID where registering a group signifies a
hand-off of responsibility to the IOASIDfd that it is responsible for
the isolation of those devices.  From there we can determine the value
of exposing VFIO device fds directly and whether any of the VFIO
interfaces for attaching devices to IOASIDs make sense versus switching
to the IOASIDfd at that point.

Otherwise, for a device centric VFIO/IOASID model, I need to understand
exactly when and how VFIO can know that it's safe to provide access to
a device and how the IOASID model guarantees the ongoing safety of that
access, which must encompass the safety relative to the entire group.

For example, is it VFIO's job to BIND every device in the group?  Does
binding the device represent the point at which the IOASID takes
responsibility for the isolation of the device?  If instead it's the
ATTACH of a device that provides the isolation, how is VFIO supposed to
handle device access across a group when one device is DETACH'd by the
user?  If ATTACH is the point where isolation is guaranteed, can a
DETACH occur through the IOASIDfd rather than the VFIOfd?  It seems
like the IOASIDfd is going to need ways to manipulate device:IOASID
mappings outside of VFIO, so again I wonder if we should switch to an
IOASID uAPI at that point rather than using VFIO.  Thanks,

Alex



Re: Plan for /dev/ioasid RFC v2

2021-06-11 Thread Alex Williamson
On Fri, 11 Jun 2021 00:58:35 +
"Tian, Kevin"  wrote:

> Hi, Alex,
> 
> > From: Alex Williamson 
> > Sent: Thursday, June 10, 2021 11:39 PM
> > 
> > On Wed, 9 Jun 2021 15:49:40 -0300
> > Jason Gunthorpe  wrote:
> >   
> > > On Wed, Jun 09, 2021 at 10:27:22AM -0600, Alex Williamson wrote:
> > >  
> > > > > > It is a kernel decision, because a fundamental task of the kernel
> > > > > > is to ensure isolation between user-space tasks as well as it can.
> > > > > > And if a device assigned to one task can interfere with a device
> > > > > > of another task (e.g. by sending P2P messages), then the promise
> > > > > > of isolation is broken.  
> > > > >
> > > > > AIUI, the IOASID model will still enforce IOMMU groups, but it's not
> > > > > an explicit part of the interface like it is for vfio.  For example
> > > > > the IOASID model allows attaching individual devices such that we
> > > > > have granularity to create per device IOASIDs, but all devices
> > > > > within an IOMMU group are required to be attached to an IOASID
> > > > > before they can be used.  
> > >
> > > Yes, thanks Alex
> > >  
> > > > > It's not entirely clear to me yet how that last bit gets
> > > > > implemented though, ie. what barrier is in place to prevent device
> > > > > usage prior to reaching this viable state.  
> > >
> > > The major security checkpoint for the group is on the VFIO side.  We
> > > must require the group before userspace can be allowed access to any
> > > device registers. Obtaining the device_fd from the group_fd does this
> > > today as the group_fd is the security proof.
> > >
> > > Actually, thinking about this some more.. If the only way to get a
> > > working device_fd in the first place is to get it from the group_fd
> > > and thus pass a group-based security check, why do we need to do
> > > anything at the ioasid level?
> > >
> > > The security concept of isolation was satisfied as soon as userspace
> > > opened the group_fd. What do more checks in the kernel accomplish?  
> > 
> > Opening the group is not the extent of the security check currently
> > required, the group must be added to a container and an IOMMU model
> > configured for the container *before* the user can get a devicefd.
> > Each devicefd creates a reference to this security context, therefore
> > access to a device does not exist without such a context.  
> 
> IIUC each device has a default domain when it's probed by the iommu
> driver at boot time. This domain includes an empty page table, implying
> that the device is already in a security context before it's probed by a
> device driver.

The default domain could be passthrough though, right?

> Now when this device is added to vfio, vfio creates another security 
> context through the above sequence. This sequence requires the device to
> switch from the default security context to this new one, before it can
> be accessed by the user.

This is true currently, we use group semantics with the type1 IOMMU
backend to attach all devices in the group to a secure context,
regardless of the default domain.

> Then I wonder whether it's really necessary. As long as a device is in
> a security context at any time, access to a device can be allowed. The
> user itself should ensure that the access happens only after the device
> creates a reference to the new security context that is desired by this
> user.
>
> Then what does the group really bring to us?

By definition an IOMMU group is the smallest set of devices that we
can consider isolated from all other devices.  Therefore devices in a
group are not necessarily isolated from each other.  Therefore if any
device within a group is not isolated, the group is not isolated.  VFIO
needs to know when it's safe to provide userspace access to the device,
but the device isolation is dependent on the group isolation.  The
group is therefore part of this picture whether implicit or explicit.

> With this new proposal we just need to make sure that a device cannot
> be attached to any IOASID before all devices in its group are bound to
> the IOASIDfd. If we want to start with a vfio-like policy, then all devices
> in the group must be attached to the same IOASID. Or as Jason suggests,
> they can attach to different IOASIDs (if in the group due to !ACS) if the
> user wants, or have some devic

Re: Plan for /dev/ioasid RFC v2

2021-06-11 Thread Alex Williamson
On Fri, 11 Jun 2021 13:45:29 -0300
Jason Gunthorpe  wrote:

> On Thu, Jun 10, 2021 at 09:38:42AM -0600, Alex Williamson wrote:
> 
> > Opening the group is not the extent of the security check currently
> > required, the group must be added to a container and an IOMMU model
> > configured for the container *before* the user can get a devicefd.
> > Each devicefd creates a reference to this security context, therefore
> > access to a device does not exist without such a context.  
> 
> Okay, I missed that detail in the organization..
> 
> So, if we have an independent vfio device fd then it needs to be
> kept disabled until the user joins it to an ioasid that provides the
> security proof to allow it to work?

Yes, the user would effectively get a dummy fd with no device access
until not only that device, but every device in the IOMMU group is
attached to a secure context.  Then we get into questions about whether
devices can be moved between contexts/ioasids within the same ioasidfd
and what that implies to both the device and all other devices within
the group as a device is transitioned and the system is potentially
exposed.
 
> > What happens on detach?  As we've discussed elsewhere in this thread,
> > revoking access is more difficult than holding a reference to the
> > secure context, but I'm under the impression that moving a device
> > between IOASIDs could be standard practice in this new model.  A device
> > that's detached from a secure context, even temporarily, is a
> > problem.  
> 
> This is why I think the single iommu FD is critical: it is the FD, not
> the IOASID, that has to authorize the security. You shouldn't move
> devices between FDs, but you can move them between IOASIDs inside the
> same FD.

Right, but that doesn't solve the issue.  Removing a device from one
isolated context, even if to move it to another isolated context within
the same ioasidfd exposes the device and has implications for all
devices within the group.

> > How to label a device seems like a relatively mundane issue relative to
> > ownership and isolated contexts of groups and devices.  The label is
> > essentially just creating an identifier to device mapping, where the
> > identifier (label) will be used in the IOASID interface, right?   
> 
> It looks that way
> 
> > As I note above, that makes it difficult for vfio to maintain that a
> > user only accesses a device in a secure context.  This is exactly
> > why vfio has the model of getting a devicefd from a groupfd only
> > when that group is in a secure context and maintaining references to
> > that secure context for each device.  Split ownership of the secure
> > context in IOASID vs device access in vfio and exposing devicefds
> > outside the group is still a big question mark for me.  Thanks,  
> 
> I think the protection model becomes different once we allow
> individual devices inside a group to be attached to different
> IOASID's.
> 
> Now we just want some general authorization that the user is allowed
> to operate the device_fd.

That's fine for a serial port, but not a device that can do DMA.  The
entire point of vfio is to try to provide secure, DMA capable userspace
drivers.  If we relax enforcement of that isolation we've failed.

> To keep a fairly similar model to the way vfio does things today..
> 
>  - The device_fd is single open, so only one fd exists globally
>
>  - Upon first joining the iommu_fd the group is obtained inside
>the iommu_fd. This is only possible if no other iommu_fd has
>obtained the group

vfio_groups have an ownership model, iommu_groups do not.
 
>  - If the group can not be obtained then the device_fd is left
>inoperable and cannot control the device
> 
>  - If multiple devices in the same group are joined then they all
>refcount the group
> 
> It is simple, and gives semantics similar to VFIO with the notable
> difference that a process can obtain a device FD; it is just inoperable
> until the iommu_fd is attached.
> 
> Removal is OK as if you remove the device_fd from the iommu_fd (only
> allowed by closing it) then a newly opened FD is inoperable.

I don't see how this provides isolation.  If a user only needs to
attach their devicefd to an ioasidfd to have full access to their
device, not even bound by attaching to an ioasid context, then we've
failed.  All devices in a group must be bound to a secure context for
the extent of the time that any device in the group is operated by a
user.  That seems non-negotiable to me.  Thanks,

Alex
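
Jason's rules above sketch roughly to the following (all structure and
function names here are illustrative, not an existing API):

struct iommufd_group {
        struct iommu_group *grp;
        struct iommufd *owner;          /* the one iommu_fd that obtained it */
        refcount_t users;               /* joined device_fds in this group */
};

static int iommufd_join_device(struct iommufd *ictx, struct vfio_device *vdev)
{
        /*
         * Obtain (or look up) the group inside this iommu_fd; fails if
         * another iommu_fd already owns the group.
         */
        struct iommufd_group *igrp = iommufd_obtain_group(ictx, vdev->dev);

        if (IS_ERR(igrp))
                return PTR_ERR(igrp);   /* device_fd stays inoperable */

        /* Devices in the same group all refcount the same igrp. */
        refcount_inc(&igrp->users);

        /* Only now do ioctls on this device_fd start working. */
        vdev->operable = true;
        return 0;
}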



Re: Plan for /dev/ioasid RFC v2

2021-06-10 Thread Alex Williamson
On Wed, 9 Jun 2021 15:49:40 -0300
Jason Gunthorpe  wrote:

> On Wed, Jun 09, 2021 at 10:27:22AM -0600, Alex Williamson wrote:
> 
> > > > It is a kernel decision, because a fundamental task of the kernel is to
> > > > ensure isolation between user-space tasks as well as it can. And if a
> > > > device assigned to one task can interfere with a device of another task
> > > > (e.g. by sending P2P messages), then the promise of isolation is 
> > > > broken.
> > > 
> > > AIUI, the IOASID model will still enforce IOMMU groups, but it's not an
> > > explicit part of the interface like it is for vfio.  For example the
> > > IOASID model allows attaching individual devices such that we have
> > > granularity to create per device IOASIDs, but all devices within an
> > > IOMMU group are required to be attached to an IOASID before they can be
> > > used.
> 
> Yes, thanks Alex
> 
> > > It's not entirely clear to me yet how that last bit gets
> > > implemented though, ie. what barrier is in place to prevent device
> > > usage prior to reaching this viable state.  
> 
> The major security checkpoint for the group is on the VFIO side.  We
> must require the group before userspace can be allowed access to any
> device registers. Obtaining the device_fd from the group_fd does this
> today as the group_fd is the security proof.
> 
> Actually, thinking about this some more.. If the only way to get a
> working device_fd in the first place is to get it from the group_fd
> and thus pass a group-based security check, why do we need to do
> anything at the ioasid level?
> 
> The security concept of isolation was satisfied as soon as userspace
> opened the group_fd. What do more checks in the kernel accomplish?

Opening the group is not the extent of the security check currently
required, the group must be added to a container and an IOMMU model
configured for the container *before* the user can get a devicefd.
Each devicefd creates a reference to this security context, therefore
access to a device does not exist without such a context.
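
For reference, the sequence being described is the canonical vfio uAPI
flow (real ioctls; error handling elided, group number and BDF are
examples):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int get_vfio_device_fd(void)
{
        int container = open("/dev/vfio/vfio", O_RDWR);
        int group = open("/dev/vfio/26", O_RDWR);

        /* Fails unless the group is viable. */
        ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);

        /* Establishes the secure IOMMU context. */
        ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

        /* Only now will vfio hand out a device fd. */
        return ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
}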

This proposal has of course put the device before the group, which then
makes it more difficult for vfio to retroactively enforce security.

> Yes, we have the issue where some groups require all devices to use
> the same IOASID, but once someone has the group_fd that is no longer a
> security issue. We can fail VFIO_DEVICE_ATTACH_IOASID calls that
> don't make sense.

The groupfd only proves the user has an ownership claim to the devices,
it does not itself prove that the devices are in an isolated context.
Device access is not granted until that isolated context is configured.

vfio owns the device, so it would make sense for vfio to enforce the
security of device access only in a secure context, but how do we know
a device is in a secure context?

Is it sufficient to track the vfio device ioctls for attach/detach for
an IOASID or will the user be able to manipulate IOASID configuration
for a device directly via the IOASIDfd?

What happens on detach?  As we've discussed elsewhere in this thread,
revoking access is more difficult than holding a reference to the
secure context, but I'm under the impression that moving a device
between IOASIDs could be standard practice in this new model.  A device
that's detached from a secure context, even temporarily, is a problem.
Access to other devices in the same group as a device detached from a
secure context is a problem.

> > > > > Groups should be primarily about isolation security, not about IOASID
> > > > > matching.  
> > > > 
> > > > That doesn't make any sense, what do you mean by 'IOASID matching'?
> > > 
> > > One of the problems with the vfio interface use of groups is that we
> > > conflate the IOMMU group for both isolation and granularity.  I think
> > > what Jason is referring to here is that we still want groups to be the
> > > basis of isolation, but we don't want a uAPI that presumes all devices
> > > within the group must use the same IOASID.
> 
> Yes, thanks again Alex
> 
> > > For example, if a user owns an IOMMU group consisting of
> > > non-isolated functions of a multi-function device, they should be
> > > able to create a vIOMMU VM where each of those functions has its
> > > own address space.  That can't be done today, the entire group
> > > would need to be attached to the VM under a PCIe-to-PCI bridge to
> > > reflect the address space limitation imposed by the vfio group
> > > uAPI model.  Thanks,  
> > 
> > Hmm, likely discussed previously in the

Re: [RFC] /dev/ioasid uAPI proposal

2021-06-09 Thread Alex Williamson
On Wed, 9 Jun 2021 02:49:32 +
"Tian, Kevin"  wrote:

> > From: Alex Williamson 
> > Sent: Wednesday, June 9, 2021 2:47 AM
> > 
> > On Tue, 8 Jun 2021 15:44:26 +0200
> > Paolo Bonzini  wrote:
> >   
> > > On 08/06/21 15:15, Jason Gunthorpe wrote:  
> > > > On Tue, Jun 08, 2021 at 09:56:09AM +0200, Paolo Bonzini wrote:
> > > >  
> > > >>>> Alternatively you can add a KVM_DEV_IOASID_{ADD,DEL} pair of ioctls. But it
> > > >>>> seems useless complication compared to just using what we have now, at least
> > > >>>> while VMs only use IOASIDs via VFIO.  
> > > >>>
> > > >>> The simplest is KVM_ENABLE_WBINVD() and be done
> > > >>> with it.  
> > 
> > Even if we were to relax wbinvd access to any device (capable of
> > no-snoop or not) in any IOMMU configuration (blocking no-snoop or not),
> > I think as soon as we say "proof" is required to gain this access then
> > that proof should be ongoing for the life of the access.
> > 
> > That alone makes this more than an "I want this feature, here's my
> > proof", one-shot ioctl.  Like the groupfd enabling a path for KVM to
> > ask if that group is non-coherent and holding a group reference to
> > prevent the group from being used to authorize multiple KVM instances,
> > the ioasidfd proof would need to minimally validate that devices are
> > present and provide some reference to enforce that model as ongoing, or
> > notifier to indicate an end of that authorization.  I don't think we
> > can simplify that out of the equation or we've essentially invalidated
> > that the proof is really required.
> >   
> > > >>
> > > >> The simplest one is KVM_DEV_VFIO_GROUP_ADD/DEL, that already exists and also
> > > >> covers hot-unplug.  The second simplest one is KVM_DEV_IOASID_ADD/DEL.  
> > > >
> > > > This isn't the same thing, this is back to trying to have the kernel
> > > > set policy for userspace.  
> > >
> > > If you want a userspace policy then there would be three states:
> > >
> > > * WBINVD enabled because a WBINVD-enabled VFIO device is attached.
> > >
> > > * WBINVD potentially enabled but no WBINVD-enabled VFIO device attached
> > >
> > > * WBINVD forcefully disabled
> > >
> > > KVM_DEV_VFIO_GROUP_ADD/DEL can still be used to distinguish the first
> > > two.  Due to backwards compatibility, those two describe the default
> > > behavior; disabling wbinvd can be done easily with a new sub-ioctl of
> > > KVM_ENABLE_CAP and doesn't require any security proof.  
> > 
> > That seems like a good model, use the kvm-vfio device for the default
> > behavior and extend an existing KVM ioctl if QEMU still needs a way to
> > tell KVM to assume all DMA is coherent, regardless of what the kvm-vfio
> > device reports.
> > 
> > It feels like we should be able to support a backwards compatibility
> > mode using the vfio group, but I expect long term we'll want to
> > transition the kvm-vfio device from a groupfd to an ioasidfd.
> >   
> > > The meaning of WBINVD-enabled is "won't return -ENXIO for the wbinvd
> > > ioctl", nothing more nothing less.  If all VFIO devices are going to be
> > > WBINVD-enabled, then that will reflect on KVM as well, and I won't have
> > > anything to object if there's consensus on the device assignment side of
> > > things that the wbinvd ioctl won't ever fail.  
> > 
> > If we create the IOMMU vs device coherency matrix:
> > 
> >              \ Device supports
> > IOMMU blocks  \   no-snoop
> >   no-snoop     \  yes | no  |
> > ----------------+-----+-----+
> >        yes      |  1  |  2  |
> > ----------------+-----+-----+
> >        no       |  3  |  4  |
> > ----------------+-----+-----+
> > 
> > DMA is always coherent in boxes {1,2,4} (wbinvd emulation is not
> > needed).  VFIO will currently always configure the IOMMU for {1,2} when
> > the feature is supported.  Boxes {3,4} are where we'll currently
> > emulate wbinvd.  The best we could do, without knowing the guest or
> > having insight into the guest driver, would be to only emulate wbinvd for {3}.
> > 
> > The majority of devices appear to report no-snoop support {1,3}, but
> > the claim is that it's mostly unused outside of GPUs, effecti

Re: Plan for /dev/ioasid RFC v2

2021-06-09 Thread Alex Williamson
On Wed, 9 Jun 2021 10:15:32 -0600
Alex Williamson  wrote:

> On Wed, 9 Jun 2021 17:51:26 +0200
> Joerg Roedel  wrote:
> 
> > On Wed, Jun 09, 2021 at 12:00:09PM -0300, Jason Gunthorpe wrote:  
> > > Only *drivers* know what the actual device is going to do, devices do
> > > not. Since the group doesn't have drivers it is the wrong layer to be
> > > making choices about how to configure the IOMMU.
> > 
> > Groups don't carry how to configure IOMMUs, that information is
> > mostly in the IOMMU domains. And those (or an abstraction of them) is
> > configured through /dev/ioasid. So not sure what you wanted to say with
> > the above.
> > 
> > All a group carries is information about which devices are not
> > sufficiently isolated from each other and thus need to always be in the
> > same domain.
> >   
> > > The device centric approach is my attempt at this, and it is pretty
> > > clean, I think.
> > 
> > Clean, but still insecure.
> >   
> > > All ACS does is prevent P2P operations, if you assign all the group
> > > devices into the same /dev/iommu then you may not care about that
> > > security isolation property. At the very least it is policy for user
> > > to decide, not kernel.
> > 
> > It is a kernel decision, because a fundamental task of the kernel is to
> > ensure isolation between user-space tasks as well as it can. And if a
> > device assigned to one task can interfere with a device of another task
> > (e.g. by sending P2P messages), then the promise of isolation is broken.  
> 
> AIUI, the IOASID model will still enforce IOMMU groups, but it's not an
> explicit part of the interface like it is for vfio.  For example the
> IOASID model allows attaching individual devices such that we have
> granularity to create per device IOASIDs, but all devices within an
> IOMMU group are required to be attached to an IOASID before they can be
> used.  It's not entirely clear to me yet how that last bit gets
> implemented though, ie. what barrier is in place to prevent device
> usage prior to reaching this viable state.
> 
> > > Groups should be primarily about isolation security, not about IOASID
> > > matching.
> > 
> > That doesn't make any sense, what do you mean by 'IOASID matching'?  
> 
> One of the problems with the vfio interface use of groups is that we
> conflate the IOMMU group for both isolation and granularity.  I think
> what Jason is referring to here is that we still want groups to be the
> basis of isolation, but we don't want a uAPI that presumes all devices
> within the group must use the same IOASID.  For example, if a user owns
> an IOMMU group consisting of non-isolated functions of a multi-function
> device, they should be able to create a vIOMMU VM where each of those
> functions has its own address space.  That can't be done today, the
> entire group would need to be attached to the VM under a PCIe-to-PCI
> bridge to reflect the address space limitation imposed by the vfio
> group uAPI model.  Thanks,

Hmm, this was likely discussed previously in these threads, but I can't
come up with the argument that prevents us from making the BIND interface
group-level while keeping the ATTACH interface device-level.  For
example:

 - VFIO_GROUP_BIND_IOASID_FD
 - VFIO_DEVICE_ATTACH_IOASID

AFAICT that makes the group ownership more explicit but still allows
the device level IOASID granularity.  Logically this is just an
internal iommu_group_for_each_dev() in the BIND ioctl.  Thanks,

Alex
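
A hedged sketch of that BIND path (vfio_group_bind_ioasid_fd() and
ioasid_fd_bind_device() are illustrative names for the proposal, not an
existing API; iommu_group_for_each_dev() is the real iommu core helper):

static int vfio_device_bind_ioasid_fd(struct device *dev, void *data)
{
        struct ioasid_fd *iafd = data;

        /* Bind each member device; any failure aborts the group BIND. */
        return ioasid_fd_bind_device(iafd, dev);
}

static int vfio_group_bind_ioasid_fd(struct vfio_group *group,
                                     struct ioasid_fd *iafd)
{
        /* Group-level BIND is a loop over the group's member devices. */
        return iommu_group_for_each_dev(group->iommu_group, iafd,
                                        vfio_device_bind_ioasid_fd);
}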



Re: Plan for /dev/ioasid RFC v2

2021-06-09 Thread Alex Williamson
On Wed, 9 Jun 2021 17:51:26 +0200
Joerg Roedel  wrote:

> On Wed, Jun 09, 2021 at 12:00:09PM -0300, Jason Gunthorpe wrote:
> > Only *drivers* know what the actual device is going to do, devices do
> > not. Since the group doesn't have drivers it is the wrong layer to be
> > making choices about how to configure the IOMMU.  
> 
> Groups don't carry how to configure IOMMUs, that information is
> mostly in the IOMMU domains. And those (or an abstraction of them) are
> configured through /dev/ioasid. So I'm not sure what you wanted to say with
> the above.
> 
> All a group carries is information about which devices are not
> sufficiently isolated from each other and thus need to always be in the
> same domain.
> 
> > The device centric approach is my attempt at this, and it is pretty
> > clean, I think.  
> 
> Clean, but still insecure.
> 
> > All ACS does is prevent P2P operations, if you assign all the group
> > devices into the same /dev/iommu then you may not care about that
> > security isolation property. At the very least it is policy for user
> > to decide, not kernel.  
> 
> It is a kernel decision, because a fundamental task of the kernel is to
> ensure isolation between user-space tasks as well as it can. And if a
> device assigned to one task can interfere with a device of another task
> (e.g. by sending P2P messages), then the promise of isolation is broken.

AIUI, the IOASID model will still enforce IOMMU groups, but it's not an
explicit part of the interface like it is for vfio.  For example the
IOASID model allows attaching individual devices such that we have
granularity to create per device IOASIDs, but all devices within an
IOMMU group are required to be attached to an IOASID before they can be
used.  It's not entirely clear to me yet how that last bit gets
implemented though, ie. what barrier is in place to prevent device
usage prior to reaching this viable state.

> > Groups should be primarily about isolation security, not about IOASID
> > matching.  
> 
> That doesn't make any sense, what do you mean by 'IOASID matching'?

One of the problems with the vfio interface use of groups is that we
conflate the IOMMU group for both isolation and granularity.  I think
what Jason is referring to here is that we still want groups to be the
basis of isolation, but we don't want a uAPI that presumes all devices
within the group must use the same IOASID.  For example, if a user owns
an IOMMU group consisting of non-isolated functions of a multi-function
device, they should be able to create a vIOMMU VM where each of those
functions has its own address space.  That can't be done today, the
entire group would need to be attached to the VM under a PCIe-to-PCI
bridge to reflect the address space limitation imposed by the vfio
group uAPI model.  Thanks,

Alex



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-09 Thread Alex Williamson
On Wed, 9 Jun 2021 08:54:45 -0300
Jason Gunthorpe  wrote:

> On Wed, Jun 09, 2021 at 11:11:17AM +0200, Paolo Bonzini wrote:
> > On 09/06/21 10:51, Enrico Weigelt, metux IT consult wrote:  
> > > On 08.06.21 21:00, Jason Gunthorpe wrote:
> > >   
> > > > Eg I can do open() on a file and I get to keep that FD. I get to keep
> > > > that FD even if someone later does chmod() on that file so I can't
> > > > open it again.
> > > > 
> > > > There are lots of examples where a one time access control check
> > > > provides continuing access to a resource. I feel the ongoing proof is
> > > > the rarity in Unix.. 'revoke' is an uncommon concept in Unix..  
> > > 
> > > Yes, it's even possible that somebody w/ privileges opens an fd and
> > > hands it over to somebody unprivileged (eg. via unix socket). This is
> > > a very basic unix concept. If some (already opened) fd now suddenly
> > > behaves differently based on the current caller, that would be a break
> > > with traditional unix semantics.  

That's not the scenario here; this would work as expected.  It's not a
matter of who uses the fd, it's that a property of the fd that provided
elevated access has been removed.  I only need to look as far as sudo
to find examples of protocols where having access at some point in time
does not guarantee ongoing access.

If we go back to the wbinvd ioctl mechanism, if I call that ioctl with
an ioasidfd that contains no devices, then I shouldn't be able to
generate a wbinvd on the processor, right?  If I add a device,
especially in a configuration that can generate non-coherent DMA, now
that ioctl should work.  If I then remove all devices from that ioasid,
what then is the difference from the initial state.  Should the ioctl
now work because it worked once in the past?

If we equate KVM to the process access to the ioctl, then a reference
or notification method should be used to retain or mark the end of that
access.  This is no different than starting a shell via sudo (ie. an
ongoing reference) or having the previous authentication time out, or
in our case be notified it has expired.

> > That's already more or less meaningless for both KVM and VFIO, since they
> > are tied to an mm.  
> 
> vfio isn't supposed to be tied to a mm.

vfio does accounting against an mm, why shouldn't it be tied to an mm?
Maybe you mean in the new model where vfio is just a device driver?
Thanks,

Alex



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-08 Thread Alex Williamson
On Tue, 8 Jun 2021 15:44:26 +0200
Paolo Bonzini  wrote:

> On 08/06/21 15:15, Jason Gunthorpe wrote:
> > On Tue, Jun 08, 2021 at 09:56:09AM +0200, Paolo Bonzini wrote:
> >   
> >>>> Alternatively you can add a KVM_DEV_IOASID_{ADD,DEL} pair of ioctls. But it
> >>>> seems useless complication compared to just using what we have now, at least
> >>>> while VMs only use IOASIDs via VFIO.  
> >>>
> >>> The simplest is KVM_ENABLE_WBINVD() and be done
> >>> with it.  

Even if we were to relax wbinvd access to any device (capable of
no-snoop or not) in any IOMMU configuration (blocking no-snoop or not),
I think as soon as we say "proof" is required to gain this access then
that proof should be ongoing for the life of the access.

That alone makes this more than an "I want this feature, here's my
proof", one-shot ioctl.  Like the groupfd enabling a path for KVM to
ask if that group is non-coherent and holding a group reference to
prevent the group from being used to authorize multiple KVM instances,
the ioasidfd proof would need to minimally validate that devices are
present and provide some reference to enforce that model as ongoing, or
notifier to indicate an end of that authorization.  I don't think we
can simplify that out of the equation or we've essentially invalidated
that the proof is really required.

> >>
> >> The simplest one is KVM_DEV_VFIO_GROUP_ADD/DEL, that already exists and 
> >> also
> >> covers hot-unplug.  The second simplest one is KVM_DEV_IOASID_ADD/DEL.  
> > 
> > This isn't the same thing, this is back to trying to have the kernel
> > set policy for userspace.  
> 
> If you want a userspace policy then there would be three states:
> 
> * WBINVD enabled because a WBINVD-enabled VFIO device is attached.
> 
> * WBINVD potentially enabled but no WBINVD-enabled VFIO device attached
> 
> * WBINVD forcefully disabled
> 
> KVM_DEV_VFIO_GROUP_ADD/DEL can still be used to distinguish the first 
> two.  Due to backwards compatibility, those two describe the default 
> behavior; disabling wbinvd can be done easily with a new sub-ioctl of 
> KVM_ENABLE_CAP and doesn't require any security proof.

That seems like a good model, use the kvm-vfio device for the default
behavior and extend an existing KVM ioctl if QEMU still needs a way to
tell KVM to assume all DMA is coherent, regardless of what the kvm-vfio
device reports.

It feels like we should be able to support a backwards compatibility
mode using the vfio group, but I expect long term we'll want to
transition the kvm-vfio device from a groupfd to an ioasidfd.
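
For reference, the existing kvm-vfio handshake that this would eventually
replace looks roughly like the following from userspace (a sketch using
the real KVM uAPI; error handling elided):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int kvm_vfio_add_group(int vm_fd, int group_fd)
{
        struct kvm_create_device cd = { .type = KVM_DEV_TYPE_VFIO };
        struct kvm_device_attr attr = {
                .group = KVM_DEV_VFIO_GROUP,
                .attr  = KVM_DEV_VFIO_GROUP_ADD,
                .addr  = (__u64)(unsigned long)&group_fd,
        };

        if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
                return -1;

        /* KVM derives its default wbinvd/coherency behavior from this. */
        return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
}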

> The meaning of WBINVD-enabled is "won't return -ENXIO for the wbinvd 
> ioctl", nothing more nothing less.  If all VFIO devices are going to be 
> WBINVD-enabled, then that will reflect on KVM as well, and I won't have 
> anything to object if there's consensus on the device assignment side of 
> things that the wbinvd ioctl won't ever fail.

If we create the IOMMU vs device coherency matrix:

             \ Device supports
IOMMU blocks  \   no-snoop
  no-snoop     \  yes | no  |
----------------+-----+-----+
       yes      |  1  |  2  |
----------------+-----+-----+
       no       |  3  |  4  |
----------------+-----+-----+

DMA is always coherent in boxes {1,2,4} (wbinvd emulation is not
needed).  VFIO will currently always configure the IOMMU for {1,2} when
the feature is supported.  Boxes {3,4} are where we'll currently
emulate wbinvd.  The best we could do, without knowing the guest or
having insight into the guest driver, would be to only emulate wbinvd for {3}.
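
Expressed as a predicate (illustrative only), that ideal case reduces to:

/* wbinvd emulation is only needed in box {3}: the device can issue
 * no-snoop TLPs and the IOMMU does not block them. */
static bool need_wbinvd_emulation(bool dev_supports_nosnoop,
                                  bool iommu_blocks_nosnoop)
{
        return dev_supports_nosnoop && !iommu_blocks_nosnoop;
}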

The majority of devices appear to report no-snoop support {1,3}, but
the claim is that it's mostly unused outside of GPUs, effectively {2,4}.
I'll speculate that most IOMMUs support enforcing coherency (amd, arm,
fsl unconditionally, intel conditionally) {1,2}.

I think that means we're currently operating primarily in Box {1},
which does not seem to lean towards unconditional wbinvd access with
device ownership.

I think we have a desire with IOASID to allow user policy to operate
certain devices in {3} and I think the argument there is that a
specific software enforced coherence sync is more efficient on the bus
than the constant coherence enforcement by the IOMMU.

I think that the disable mode Jason proposed is essentially just a way
to move a device from {3} to {4}, ie. the IOASID support or
configuration does not block no-snoop and the device claims to support
no-snoop, but doesn't use it.  How we'd determine this to be true for
a device, without a crystal ball of driver development or a hardware
erratum stating that no-snoop transactions are not possible regardless
of the behavior of the enable bit, I'm not sure.  If we're operating a device
in {3}, but the device does not generate no-snoop transactions, then
presumably the guest driver isn't generating wbinvd instructions for us
to emulate, so where are the wbinvd instructions that this feature
would prevent being emulated coming from?  Thanks,

Alex


Re: [RFC] /dev/ioasid uAPI proposal

2021-06-07 Thread Alex Williamson
On Mon, 7 Jun 2021 20:03:53 -0300
Jason Gunthorpe  wrote:

> On Mon, Jun 07, 2021 at 01:41:28PM -0600, Alex Williamson wrote:
> 
> > > Compatibility is important, but when I look in the kernel code I see
> > > very few places that call wbinvd(). Basically all DRM for something
> > > relevant to qemu.
> > > 
> > > That tells me that the vast majority of PCI devices do not generate
> > > no-snoop traffic.  
> > 
> > Unfortunately, even just looking at devices across a couple laptops
> > most devices do support and have NoSnoop+ set by default.
> 
> Yes, mine too, but that doesn't mean the device is issuing nosnoop
> transactions, it just means the OS is allowing it to do so if it wants.
> 
> As I said, without driver support the feature cannot be used, and
> there is no driver support in Linux outside DRM, unless it is
> hidden.. Certainly I've never run into it..
> 
> Even mlx5 is setting the nosnoop bit, but I have a fairly high
> confidence that we don't set the TLP bit for anything Linux does.
> 
> > It's not safe for QEMU to make an assumption that only GPUs will
> > actually make use of it.  
> 
> Not 100% safe, but if you know you are running Linux OS in the VM you
> can look at the drivers the devices need and make a determination.

QEMU doesn't know what guest it's running or what driver the guest is
using.  QEMU can only create safe configurations by default, the same
as done now using vfio.  Anything outside of that scope would require
experimental opt-in support by the user or a guarantee from the device
vendor that the device cannot ever (not just for the existing drivers)
create non-coherent TLPs. Thanks,

Alex

> > Yes, QEMU can reject a hot-unplug event, but then QEMU retains the
> > privilege that the device grants it.  Releasing the device and
> > retaining the privilege gained by it seems wrong.  Thanks,  
> 
> It is not completely ideal, but it is such a simplification, and I
> can't really see a drawback..
> 
> Jason
> 



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-07 Thread Alex Williamson
On Mon, 7 Jun 2021 16:08:02 -0300
Jason Gunthorpe  wrote:

> On Mon, Jun 07, 2021 at 12:59:46PM -0600, Alex Williamson wrote:
> 
> > > It is up to qemu if it wants to proceed or not. There is no issue with
> > > allowing the use of no-snoop and blocking wbinvd, other than some
> > > drivers may malfunction. If the user is certain they don't have
> > > malfunctioning drivers then no issue to go ahead.  
> > 
> > A driver that knows how to use the device in a coherent way can
> > certainly proceed, but I suspect that's not something we can ask of
> > QEMU.  QEMU has no visibility to the in-use driver and sketchy ability
> > to virtualize the no-snoop enable bit to prevent non-coherent DMA from
> > the device.  There might be an experimental ("x-" prefixed) QEMU device
> > option to allow user override, but QEMU should disallow the possibility
> > of malfunctioning drivers by default.  If we have devices that probe as
> > supporting no-snoop, but actually can't generate such traffic, we might
> > need a quirk list somewhere.  
> 
> Compatibility is important, but when I look in the kernel code I see
> very few places that call wbinvd(). Basically all DRM for something
> relevant to qemu.
> 
> That tells me that the vast majority of PCI devices do not generate
> no-snoop traffic.

Unfortunately, even just looking at devices across a couple laptops
most devices do support and have NoSnoop+ set by default.  I don't
notice anything in the kernel that actually tries to set this enable (a
handful that actively disable), so I assume it's done by the firmware.
It's not safe for QEMU to make an assumption that only GPUs will
actually make use of it.

> > > I think it makes the software design much simpler if the security
> > > check is very simple. Possessing a suitable device in an ioasid fd
> > > container is enough to flip on the feature and we don't need to track
> > > changes from that point on. We don't need to revoke wbinvd if the
> > > ioasid fd changes, for instance. Better to keep the kernel very simple
> > > in this regard.  
> > 
> > You're suggesting that a user isn't forced to give up wbinvd emulation
> > if they lose access to their device?
> 
> Sure, why do we need to be stricter? It is the same logic I gave
> earlier, once an attacker process has access to wbinvd an attacker can
> just keep its access indefinitely.
> 
> The main use case for revocation assumes that qemu would be
> compromised after a device is hot-unplugged and you want to block off
> wbinvd. But I have a hard time seeing that as useful enough to justify
> all the complicated code to do it...

It's currently just a matter of the kvm-vfio device holding a reference
to the group so that it cannot be used elsewhere so long as it's being
used to elevate privileges on a given KVM instance.  If we conclude that
access to a device with the right capability is required to gain a
privilege, I don't really see how we can wave aside that the privilege
isn't lost with the device.

> For KVM qemu can turn on/off on hot plug events as it requires to give
> VM security. It doesn't need to rely on the kernel to control this.

Yes, QEMU can reject a hot-unplug event, but then QEMU retains the
privilege that the device grants it.  Releasing the device and
retaining the privilege gained by it seems wrong.  Thanks,

Alex



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-07 Thread Alex Williamson
On Mon, 7 Jun 2021 15:18:58 -0300
Jason Gunthorpe  wrote:

> On Mon, Jun 07, 2021 at 09:41:48AM -0600, Alex Williamson wrote:
> > You're calling this an admin knob, which to me suggests a global module
> > option, so are you trying to implement both an administrator and a user
> > policy?  ie. the user can create scenarios where access to wbinvd might
> > be justified by hardware/IOMMU configuration, but can be limited by the
> > admin?  
> 
> Could be a per-device sysfs too. I'm not really sure what is useful
> here.
> 
> > For example I proposed that the ioasidfd would bear the responsibility
> > of a wbinvd ioctl and therefore validate the user's access to enable
> > wbinvd emulation w/ KVM, so I'm assuming this module option lives
> > there.
> 
> Right, this is what I was thinking
> 
> > What then is "automatic" mode?  The user cannot create a non-coherent
> > IOASID with a non-coherent device if the IOMMU supports no-snoop
> > blocking?  Do they get a failure?  Does it get silently promoted to
> > coherent?  
> 
> "automatic" was just a way to keep the API the same as today. Today if
> the IOMMU can block no-snoop then vfio disables wbinvd. To get the
> same level of security automatic mode would detect that vfio would
> have blocked wbinvd because the IOMMU can do it, and then always block
> it.
> 
> It makes sense if there is an admin knob, as the admin could then move
> to an explicit enable/disable to get functionality they can't get
> today.
> 
> > In "disable" mode, I think we're just narrowing the restriction
> > further, a non-coherent capable device cannot be used except in a
> > forced coherent IOASID.  
> 
> I wouldn't say "cannot be used" - just you can't get access to
> wbinvd. 
> 
> It is up to qemu if it wants to proceed or not. There is no issue with
> allowing the use of no-snoop and blocking wbinvd, other than some
> drivers may malfunction. If the user is certain they don't have
> malfunctioning drivers then no issue to go ahead.

A driver that knows how to use the device in a coherent way can
certainly proceed, but I suspect that's not something we can ask of
QEMU.  QEMU has no visibility to the in-use driver and sketchy ability
to virtualize the no-snoop enable bit to prevent non-coherent DMA from
the device.  There might be an experimental ("x-" prefixed) QEMU device
option to allow user override, but QEMU should disallow the possibility
of malfunctioning drivers by default.  If we have devices that probe as
supporting no-snoop, but actually can't generate such traffic, we might
need a quirk list somewhere.

> The current vfio arrangement (automatic) maximized compatibility. The
> enable/disable options provide for max performance and max security as
> alternative targets.
> 
> > > It is the strength of Paolo's model that KVM should be able to do
> > > optionally less, but never more, than the process itself can do.
> > 
> > I think my previous reply was working towards those guidelines.  I feel
> > like we're mostly in agreement, but perhaps reading past each other.  
> 
> Yes, I think I said we were agreeing :)
> 
> > Nothing here convinced me against my previous proposal that the
> > ioasidfd bears responsibility for managing access to a wbinvd ioctl,
> > and therefore the equivalent KVM access.  Whether wbinvd is allowed or
> no-op'd when the user has access to a non-coherent device in a
> > configuration where the IOMMU prevents non-coherent DMA is maybe still
> > a matter of personal preference.  
> 
> I think it makes the software design much simpler if the security
> check is very simple. Possessing a suitable device in an ioasid fd
> container is enough to flip on the feature and we don't need to track
> changes from that point on. We don't need to revoke wbinvd if the
> ioasid fd changes, for instance. Better to keep the kernel very simple
> in this regard.

You're suggesting that a user isn't forced to give up wbinvd emulation
if they lose access to their device?  I suspect that like we do today,
we'll want to re-evaluate the need for wbinvd on most device changes.
I think this is why the kvm-vfio device holds a vfio group reference;
to make sure a given group can't elevate privileges for multiple
processes.  Thanks,

Alex



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-07 Thread Alex Williamson
On Fri, 4 Jun 2021 20:01:08 -0300
Jason Gunthorpe  wrote:

> On Fri, Jun 04, 2021 at 03:29:18PM -0600, Alex Williamson wrote:
> > On Fri, 4 Jun 2021 14:22:07 -0300
> > Jason Gunthorpe  wrote:
> >   
> > > On Fri, Jun 04, 2021 at 06:10:51PM +0200, Paolo Bonzini wrote:  
> > > > On 04/06/21 18:03, Jason Gunthorpe wrote:
> > > > > On Fri, Jun 04, 2021 at 05:57:19PM +0200, Paolo Bonzini wrote:
> > > > > > I don't want a security proof myself; I want to trust VFIO to
> > > > > > make the right judgment and I'm happy to defer to it (via the
> > > > > > KVM-VFIO device).
> > > > > > 
> > > > > > Given how KVM is just a device driver inside Linux, VMs should be
> > > > > > a slightly more roundabout way to do stuff that is accessible to
> > > > > > bare metal; not a way to gain extra privilege.
> > > > > 
> > > > > Okay, fine, let's turn the question on its head then.
> > > > > 
> > > > > VFIO should provide an IOCTL VFIO_EXECUTE_WBINVD so that userspace VFIO
> > > > > application can make use of no-snoop optimizations. The ability of KVM
> > > > > to execute wbinvd should be tied to the ability of that IOCTL to run
> > > > > in a normal process context.
> > > > > 
> > > > > So, under what conditions do we want to allow VFIO to give a process
> > > > > elevated access to the CPU:
> > > > 
> > > > Ok, I would definitely not want to tie it *only* to CAP_SYS_RAWIO
> > > > (i.e. #2+#3 would be worse than what we have today), but IIUC the
> > > > proposal (was it yours or Kevin's?) was to keep #2 and add #1 with
> > > > an enable/disable ioctl, which then would be on VFIO and not on KVM.
> > > 
> > > At the end of the day we need an ioctl with two arguments:
> > >  - The 'security proof' FD (ie /dev/vfio/XX, or /dev/ioasid, or whatever)
> > >  - The KVM FD to control wbinvd support on
> > > 
> > > Philosophically it doesn't matter too much which subsystem that ioctl
> > > lives, but we have these obnoxious cross module dependencies to
> > > consider.. 
> > > 
> > > Framing the question, as you have, to be about the process, I think
> > > explains why KVM doesn't really care what is decided, so long as the
> > > process and the VM have equivalent rights.
> > > 
> > > Alex, how about a more fleshed out suggestion:
> > > 
> > >  1) When the device is attached to the IOASID via VFIO_ATTACH_IOASID
> > > it communicates its no-snoop configuration:  
> > 
> > Communicates to whom?  
> 
> To the /dev/iommu FD which will have to maintain a list of devices
> attached to it internally.
> 
> > >  - 0 enable, allow WBINVD
> > >  - 1 automatic disable, block WBINVD if the platform
> > >IOMMU can police it (what we do today)
> > >  - 2 force disable, do not allow WBINVD ever
> > 
> > The only thing we know about the device is whether or not Enable
> > No-snoop is hard wired to zero, ie. it either can't generate no-snoop
> > TLPs ("coherent-only") or it might ("assumed non-coherent").
> 
> Here I am outlining the choices and also imagining we might want an
> admin knob to select among the three.

You're calling this an admin knob, which to me suggests a global module
option, so are you trying to implement both an administrator and a user
policy?  ie. the user can create scenarios where access to wbinvd might
be justified by hardware/IOMMU configuration, but can be limited by the
admin?

For example I proposed that the ioasidfd would bear the responsibility
of a wbinvd ioctl and therefore validate the user's access to enable
wbinvd emulation w/ KVM, so I'm assuming this module option lives
there.  I essentially described the "enable" behavior in my previous
reply: the user has access to wbinvd if owning a non-coherent capable
device managed in a non-coherent IOASID.  Yes, the user IOASID
configuration controls the latter half of this.

What then is "automatic" mode?  The user cannot create a non-coherent
IOASID with a non-coherent device if the IOMMU supports no-snoop
blocking?  Do they get a failure?  Does it get silently promoted to
coherent?

In "disable" mode, I think we're just narrowing the restriction
further, a non-coherent capable device cannot be u

Re: [RFC] /dev/ioasid uAPI proposal

2021-06-04 Thread Alex Williamson
On Fri, 4 Jun 2021 09:13:37 -0300
Jason Gunthorpe  wrote:

> On Thu, Jun 03, 2021 at 02:41:36PM -0600, Alex Williamson wrote:
> 
> > Could you clarify "vfio_driver"?
> 
> This is the thing providing the vfio_device_ops function pointers.
> 
> So vfio-pci can't know anything about this (although your no-snoop
> control probing idea makes sense to me)
> 
> But vfio_mlx5_pci can know
> 
> So can mdev_idxd
> 
> And kvmgt

A capability on VFIO_DEVICE_GET_INFO could provide a hint to userspace.
Stock vfio-pci could fill it out to the extent of advertising whether
the device is capable of non-coherent DMA based on the Enable No-snoop
probing, while device specific vfio_drivers could set it based on
knowledge of the device behavior.  Another bit might indicate a
preference to not suppress non-coherent DMA at the IOMMU.  Thanks,

Alex



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-04 Thread Alex Williamson
On Fri, 4 Jun 2021 14:22:07 -0300
Jason Gunthorpe  wrote:

> On Fri, Jun 04, 2021 at 06:10:51PM +0200, Paolo Bonzini wrote:
> > On 04/06/21 18:03, Jason Gunthorpe wrote:  
> > > On Fri, Jun 04, 2021 at 05:57:19PM +0200, Paolo Bonzini wrote:  
> > > > I don't want a security proof myself; I want to trust VFIO to make
> > > > the right judgment and I'm happy to defer to it (via the KVM-VFIO
> > > > device).
> > > > 
> > > > Given how KVM is just a device driver inside Linux, VMs should be a
> > > > slightly more roundabout way to do stuff that is accessible to bare
> > > > metal; not a way to gain extra privilege.
> > > 
> > > Okay, fine, let's turn the question on its head then.
> > > 
> > > VFIO should provide an IOCTL VFIO_EXECUTE_WBINVD so that userspace VFIO
> > > application can make use of no-snoop optimizations. The ability of KVM
> > > to execute wbinvd should be tied to the ability of that IOCTL to run
> > > in a normal process context.
> > > 
> > > So, under what conditions do we want to allow VFIO to give a process
> > > elevated access to the CPU:  
> > 
> > Ok, I would definitely not want to tie it *only* to CAP_SYS_RAWIO (i.e.
> > #2+#3 would be worse than what we have today), but IIUC the proposal (was it
> > yours or Kevin's?) was to keep #2 and add #1 with an enable/disable ioctl,
> > which then would be on VFIO and not on KVM.
> 
> At the end of the day we need an ioctl with two arguments:
>  - The 'security proof' FD (ie /dev/vfio/XX, or /dev/ioasid, or whatever)
>  - The KVM FD to control wbinvd support on
> 
> Philosophically it doesn't matter too much which subsystem that ioctl
> lives, but we have these obnoxious cross module dependencies to
> consider.. 
> 
> Framing the question, as you have, to be about the process, I think
> explains why KVM doesn't really care what is decided, so long as the
> process and the VM have equivalent rights.
> 
> Alex, how about a more fleshed out suggestion:
> 
>  1) When the device is attached to the IOASID via VFIO_ATTACH_IOASID
> it communicates its no-snoop configuration:

Communicates to whom?

>  - 0 enable, allow WBINVD
>  - 1 automatic disable, block WBINVD if the platform
>IOMMU can police it (what we do today)
>  - 2 force disable, do not allow WBINVD ever

The only thing we know about the device is whether or not Enable
No-snoop is hard wired to zero, ie. it either can't generate no-snoop
TLPs ("coherent-only") or it might ("assumed non-coherent").  If
we're putting the policy decision in the hands of userspace they should
have access to wbinvd if they own a device that is assumed
non-coherent AND it's attached to an IOMMU (page table) that is not
blocking no-snoop (a "non-coherent IOASID").

I think that means that the IOASID needs to be created (IOASID_ALLOC)
with a flag that specifies whether this address space is coherent
(IOASID_GET_INFO probably needs a flag/cap to expose if the system
supports this).  All mappings in this IOASID would use IOMMU_CACHE and
devices attached to it would be required to be backed by an IOMMU
capable of IOMMU_CAP_CACHE_COHERENCY (attach fails otherwise).  If only
these IOASIDs exist, access to wbinvd would not be provided.  (How does
a user provided page table work? - reserved bit set, user error?)

Conversely, a user could create a non-coherent IOASID and attach any
device to it, regardless of IOMMU backing capabilities.  Only if an
assumed non-coherent device is attached would the wbinvd be allowed.

I think that means that an EXECUTE_WBINVD ioctl lives on the IOASIDFD
and the IOASID world needs to understand the device's ability to
generate non-coherent DMA.  This wbinvd ioctl would be a no-op (or
some known errno) unless a non-coherent IOASID exists with a potentially
non-coherent device attached.
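
To make that concrete, the flow I have in mind would look something
like the below.  This is only a sketch of the model; IOASID_ALLOC,
IOASID_GET_INFO, IOASID_EXECUTE_WBINVD, VFIO_ATTACH_IOASID and the
flag names are placeholders for a uAPI that doesn't exist yet:

  struct ioasid_info info;
  ioctl(ioasid_fd, IOASID_GET_INFO, &info);  /* can coherency be forced? */

  /* coherent address space: all mappings get IOMMU_CACHE */
  struct ioasid_alloc_req alloc = { .flags = IOASID_FLAG_COHERENT };
  int gpa_ioasid = ioctl(ioasid_fd, IOASID_ALLOC, &alloc);

  /* fails unless the device's IOMMU has IOMMU_CAP_CACHE_COHERENCY */
  ioctl(device_fd, VFIO_ATTACH_IOASID, &gpa_ioasid);

  /* with only coherent IOASIDs in the fd, wbinvd is denied/no-op'd */
  ioctl(ioasid_fd, IOASID_EXECUTE_WBINVD, 0);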
 
> vfio_pci may want to take this from an admin configuration knob
> someplace. It allows the admin to customize if they want.
> 
> If we can figure out a way to autodetect 2 from vfio_pci, all the
> better
> 
>  2) There is some IOMMU_EXECUTE_WBINVD IOCTL that allows userspace
> to access wbinvd so it can make use of the no snoop optimization.
> 
> wbinvd is allowed when:
>   - A device is joined with mode #0
>   - A device is joined with mode #1 and the IOMMU cannot block
> no-snoop (today)
> 
>  3) The IOASID's don't care about this at all. If IOMMU_EXECUTE_WBINVD
> is blocked and userspace doesn't request to block no-snoop in the
> IOASID then it is a userspace error.

In my model above, the IOASID is central to this.
 
>  4) The KVM interface is the very simple enable/disable WBINVD.
> Possessing a FD that can do IOMMU_EXECUTE_WBINVD is required
> to enable WBINVD at KVM.

Right, and in the new world order, vfio is only a device driver, the
IOASID manages the device's DMA.  wbinvd is only necessary relative to
non-coherent DMA, so it seems like QEMU needs to bump KVM with an
ioasidfd.
 
> It is pret

Re: [RFC] /dev/ioasid uAPI proposal

2021-06-04 Thread Alex Williamson
On Fri, 4 Jun 2021 09:19:50 +
"Tian, Kevin"  wrote:

> > From: Alex Williamson 
> > Sent: Friday, June 4, 2021 4:42 AM
> >   
> > > 'qemu --allow-no-snoop' makes more sense to me  
> > 
> > I'd be tempted to attach it to the -device vfio-pci option, it's
> > specific drivers for specific devices that are going to want this and
> > those devices may not be permanently attached to the VM.  But I see in
> > the other thread you're trying to optimize IOMMU page table sharing.
> > 
> > There's a usability question in either case though and I'm not sure how
> > to get around it other than QEMU or the kernel knowing a list of
> > devices (explicit IDs or vendor+class) to select per device defaults.
> >   
> 
> "-device vfio-pci" is a per-device option, which implies that the
> no-snoop choice is given to the admin, so there's no need to maintain
> a fixed device list in Qemu?

I think we want to look at where we put it to have the best default
user experience.  For example the QEMU vfio-pci device option could use
on/off/auto semantics where auto is the default and QEMU maintains a
list of IDs or vendor/class configurations where we've determined the
"optimal" auto configuration.  Management tools could provide an
override, but we're imposing some pretty technical requirements for a
management tool to be able to come up with good per device defaults.
Seems like we should consolidate that technical decision in one place.
Thanks,

Alex



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-04 Thread Alex Williamson
[Cc +Paolo]

On Fri, 4 Jun 2021 09:28:30 -0300
Jason Gunthorpe  wrote:

> On Fri, Jun 04, 2021 at 08:38:26AM +, Tian, Kevin wrote:
> > > I think more to drive the replacement design; if we can't figure out
> > > how to do something other than backwards compatibility trickery in the
> > > kernel, it's probably going to bite us.  Thanks,  
> > 
> > I'm a bit lost on the desired flow in your minds. Here is one flow based
> > on my understanding of this discussion. Please comment whether it
> > matches your thinking:
> > 
> > 0) ioasid_fd is created and registered to KVM via KVM_ADD_IOASID_FD;
> > 
> > 1) Qemu binds dev1 to ioasid_fd;
> > 
> > 2) Qemu calls IOASID_GET_DEV_INFO for dev1. This will carry
> >  IOMMU_CACHE info i.e. whether the underlying IOMMU can enforce snoop;
> > 
> > 3) Qemu plans to create a gpa_ioasid, and attach dev1 to it. Here Qemu
> > needs to figure out whether dev1 wants to do no-snoop. This might
> > be based on a fixed vendor/class list or specified by the user;
> > 
> > 4) gpa_ioasid = ioctl(ioasid_fd, IOASID_ALLOC); At this point a 'snoop'
> >  flag is specified to decide the page table format, which is supposed
> >  to match dev1;  
> 
> > 5) Qemu attaches dev1 to gpa_ioasid via VFIO_ATTACH_IOASID. At this 
> >  point, specify snoop/no-snoop again. If not supported by related 
> >  iommu or different from what gpa_ioasid has, attach fails.  
> 
> Why do we need to specify it again?

My thought as well.

> If the IOASID was created with the "block no-snoop" flag then it is
> blocked in that IOASID, and that blocking sets the page table format.
> 
> The only question is if we can successfully attach a device to the
> page table, or not.
> 
> The KVM interface is a bit tricky because Alex said this is partially
> security, wbinvd is only enabled if someone has a FD to a device that
> can support no-snoop. 
> 
> Personally I think this got way too complicated, the KVM interface
> should simply be
> 
> ioctl(KVM_ALLOW_INCOHERENT_DMA, ioasidfd, device_label)
> ioctl(KVM_DISALLOW_INCOHERENT_DMA, ioasidfd, device_label)
> 
> and let qemu sort it out based on command flags, detection, whatever.
> 
> 'ioasidfd, device_label' is the security proof that Alex asked
> for. This needs to be some device in the ioasidfd that declares it is
> capable of no-snoop. Eg vfio_pci would always declare it is capable
> of no-snoop.
> 
> No kernel callbacks, no kernel auto-sync/etc. If qemu mismatches the
> IOASID block no-snoop flag with the KVM_x_INCOHERENT_DMA state then it
> is just a kernel-harmless userspace bug.
> 
> Then user space can decide which of the various axes it wants to
> optimize for.
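
For reference, driving the calls proposed above from QEMU would look
roughly like the below; the ioctl names come from the proposal and the
argument layout is purely hypothetical:

  struct kvm_incoherent_dma arg = {
          .ioasid_fd    = ioasid_fd,
          .device_label = label,
  };

  /* on hot-plug of an assumed non-coherent device */
  ioctl(kvm_fd, KVM_ALLOW_INCOHERENT_DMA, &arg);

  /* on unplug, qemu may (or may not) revoke the privilege */
  ioctl(kvm_fd, KVM_DISALLOW_INCOHERENT_DMA, &arg);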

Let's make sure the KVM folks are part of this decision; a re-cap for
them, KVM currently automatically enables wbinvd emulation when
potentially non-coherent devices are present which is determined solely
based on the IOMMU's (or platform's, as exposed via the IOMMU) ability
to essentially force no-snoop transactions from a device to be cache
coherent.  This synchronization is triggered via the kvm-vfio device,
where QEMU creates the device and adds/removes vfio group fds as an
additional layer to prevent the user from enabling
wbinvd emulation on a whim.
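
For concreteness, QEMU's side of that today is just the kvm-vfio
device uAPI (a minimal sketch, error handling omitted):

  int32_t group_fd = open("/dev/vfio/26", O_RDWR);  /* example group */
  struct kvm_create_device cd = { .type = KVM_DEV_TYPE_VFIO };

  ioctl(kvm_fd, KVM_CREATE_DEVICE, &cd);  /* returns cd.fd */

  struct kvm_device_attr attr = {
          .group = KVM_DEV_VFIO_GROUP,
          .attr  = KVM_DEV_VFIO_GROUP_ADD,
          .addr  = (uintptr_t)&group_fd,
  };
  ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);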

IIRC, this latter association was considered a security/DoS issue to
prevent a malicious guest/userspace from creating a disproportionate
system load.

Where would KVM stand on allowing more direct userspace control of
wbinvd behavior?  Would arbitrary control be acceptable or should we
continue to require it only in association with a device requiring it
for correct operation?

A wrinkle in "correct operation" is that while the IOMMU may be able to
force no-snoop transactions to be coherent, in the scenario described
in the previous reply, the user may intend to use non-coherent DMA
regardless of the IOMMU capabilities due to their own optimization
policy.  There's a whole spectrum here, including aspects we can't
determine around the device driver's intentions to use non-coherent
transactions, the user's policy in trading hypervisor overhead for
cache coherence overhead, etc.  Thanks,

Alex



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-03 Thread Alex Williamson
On Thu, 3 Jun 2021 17:10:18 -0300
Jason Gunthorpe  wrote:

> On Thu, Jun 03, 2021 at 02:01:46PM -0600, Alex Williamson wrote:
> 
> > > > > 1) Mixing IOMMU_CAP_CACHE_COHERENCY and !IOMMU_CAP_CACHE_COHERENCY
> > > > >domains.
> > > > > 
> > > > >This doesn't actually matter. If you mix them together then kvm
> > > > >will turn on wbinvd anyhow, so we don't need to use the DMA_PTE_SNP
> > > > >anywhere in this VM.
> > > > > 
> > > > >Thus if two IOMMUs are joined together into a single /dev/ioasid
> > > > >then we can just make them both pretend to be
> > > > >!IOMMU_CAP_CACHE_COHERENCY and both not set IOMMU_CACHE.
> > > > 
> > > > Yes and no.  Yes, if any domain is !IOMMU_CAP_CACHE_COHERENCY then we
> > > > need to emulate wbinvd, but no we'll use IOMMU_CACHE any time it's
> > > > available based on the per domain support available.  That gives us the
> > > > most consistent behavior, ie. we don't have VMs emulating wbinvd
> > > > because they used to have a device attached where the domain required
> > > > it and we can't atomically remap with new flags to perform the same as
> > > > a VM that never had that device attached in the first place.
> > > 
> > > I think we are saying the same thing..  
> > 
> > Hrm?  I think I'm saying the opposite of your "both not set
> > IOMMU_CACHE".  IOMMU_CACHE is the mapping flag that enables
> > DMA_PTE_SNP.  Maybe you're using IOMMU_CACHE as the state reported to
> > KVM?  
> 
> I'm saying if we enable wbinvd in the guest then no IOASIDs used by
> that guest need to set DMA_PTE_SNP.

Yes

> If we disable wbinvd in the guest
> then all IOASIDs must enforce DMA_PTE_SNP (or we otherwise guarantee
> no-snoop is not possible).

Yes, but we can't get from one of these to the other atomically wrt
the device DMA.

> This is not what VFIO does today, but it is a reasonable choice.
> 
> Based on that observation we can say as soon as the user wants to use
> an IOMMU that does not support DMA_PTE_SNP in the guest we can still
> share the IO page table with IOMMUs that do support DMA_PTE_SNP.

If your goal is to prioritize IO page table sharing, sure.  But because
we cannot atomically transition from one to the other, each device is
stuck with the page tables it has, so the history of the VM becomes a
factor in the performance characteristics.

For example if device {A} is backed by an IOMMU capable of blocking
no-snoop and device {B} is backed by an IOMMU which cannot block
no-snoop, then booting VM1 with {A,B} and later removing device {B}
would result in ongoing wbinvd emulation versus a VM2 only booted with
{A}.

Type1 would use separate IO page tables (domains/ioasids) for these such
that VM1 and VM2 have the same characteristics at the end.

Does this become user defined policy in the IOASID model?  There's
quite a mess of exposing sufficient GET_INFO for an IOASID for the user
to know such properties of the IOMMU, plus maybe we need mapping flags
equivalent to IOMMU_CACHE exposed to the user, preventing sharing an
IOASID that could generate IOMMU faults, etc.

> > > It doesn't solve the problem to connect kvm to AP and kvmgt though  
> > 
> > It does not, we'll probably need a vfio ioctl to gratuitously announce
> > the KVM fd to each device.  I think some devices might currently fail
> > their open callback if that linkage isn't already available though, so
> > it's not clear when that should happen, ie. it can't currently be a
> > VFIO_DEVICE ioctl as getting the device fd requires an open, but this
> > proposal requires some availability of the vfio device fd without any
> > setup, so presumably that won't yet call the driver open callback.
> > Maybe that's part of the attach phase now... I'm not sure, it's not
> > clear when the vfio device uAPI starts being available in the process
> > of setting up the ioasid.  Thanks,  
> 
> At a certain point we maybe just have to stick to backward compat, I
> think. Though it is useful to think about green field alternates to
> try to guide the backward compat design..

I think more to drive the replacement design; if we can't figure out
how to do something other than backwards compatibility trickery in the
kernel, it's probably going to bite us.  Thanks,

Alex



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-03 Thread Alex Williamson
On Thu, 3 Jun 2021 09:40:36 -0300
Jason Gunthorpe  wrote:

> On Thu, Jun 03, 2021 at 03:22:27AM +, Tian, Kevin wrote:
> > > From: Alex Williamson 
> > > Sent: Thursday, June 3, 2021 10:51 AM
> > > 
> > > On Wed, 2 Jun 2021 19:45:36 -0300
> > > Jason Gunthorpe  wrote:
> > >   
> > > > On Wed, Jun 02, 2021 at 02:37:34PM -0600, Alex Williamson wrote:
> > > >  
> > > > > Right.  I don't follow where you're jumping to relaying DMA_PTE_SNP
> > > > > from the guest page table... what page table?  
> > > >
> > > > I see my confusion now, the phrasing in your earlier remark led me
> > > > think this was about allowing the no-snoop performance enhancement in
> > > > some restricted way.
> > > >
> > > > It is really about blocking no-snoop 100% of the time and then
> > > > disabling the dangerous wbinvd when the block is successful.
> > > >
> > > > Didn't closely read the kvm code :\
> > > >
> > > > If it was about allowing the optimization then I'd expect the guest to
> > > > enable no-snoopable regions via its vIOMMU and realize them to the
> > > > hypervisor and plumb the whole thing through. Hence my remark about
> > > > the guest page tables..
> > > >
> > > > So really the test is just 'were we able to block it' ?  
> > > 
> > > Yup.  Do we really still consider that there's some performance benefit
> > > to be had by enabling a device to use no-snoop?  This seems largely a
> > > legacy thing.  
> > 
> > Yes, there is indeed performance benefit for device to use no-snoop,
> > e.g. 8K display and some imaging processing path, etc. The problem is
> > that the IOMMU for such devices is typically a different one from the
> > default IOMMU for most devices. This special IOMMU may not have
> > the ability of enforcing snoop on no-snoop PCI traffic then this fact
> > must be understood by KVM to do proper mtrr/pat/wbinvd virtualization 
> > for such devices to work correctly.  
> 
> Or stated another way:
> 
> We in Linux don't have a way to control if the VFIO IO page table will
> be snoop or no snoop from userspace so Intel has forced the platform's
> IOMMU path for the integrated GPU to be unable to enforce snoop, thus
> "solving" the problem.

That's giving vfio a lot of credit for influencing VT-d design.

> I don't think that is sustainable in the overall ecosystem though.

Our current behavior is a reasonable default IMO, but I agree more
control will probably benefit us in the long run.

> 'qemu --allow-no-snoop' makes more sense to me

I'd be tempted to attach it to the -device vfio-pci option, it's
specific drivers for specific devices that are going to want this and
those devices may not be permanently attached to the VM.  But I see in
the other thread you're trying to optimize IOMMU page table sharing.

There's a usability question in either case though and I'm not sure how
to get around it other than QEMU or the kernel knowing a list of
devices (explicit IDs or vendor+class) to select per device defaults.

> > When discussing I/O page fault support in another thread, the consensus
> > is that a device handle will be registered (by user) or allocated (return
> > to user) in /dev/ioasid when binding the device to ioasid fd. From this 
> > angle we can register {ioasid_fd, device_handle} to KVM and then call 
> > something like ioasidfd_device_is_coherent() to get the property. 
> > Anyway the coherency is a per-device property which is not changed 
> > by how many I/O page tables are attached to it.  
> 
> It is not device specific, it is driver specific
> 
> As I said before, the question is if the IOASID itself can enforce
> snoop, or not. AND if the device will issue no-snoop or not.
> 
> Devices that are hard wired to never issue no-snoop are safe even with
> an IOASID that cannot enforce snoop. AFAIK really only GPUs use this
> feature. Eg I would be comfortable to say mlx5 never uses the no-snoop
> TLP flag.
> 
> Only the vfio_driver could know this.

Could you clarify "vfio_driver"?  The existing vfio-pci driver can't
know this, beyond perhaps probing if the Enable No-snoop bit is
hardwired to zero.  It's the driver running on top of vfio that
ultimately controls whether a capable device actually issues no-snoop
TLPs, but that can't be known to us.  A vendor variant of vfio-pci
might certainly know more about how its device is used by those
userspace/VM drivers.  Thanks,

Alex



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-03 Thread Alex Williamson
On Thu, 3 Jun 2021 09:34:01 -0300
Jason Gunthorpe  wrote:

> On Wed, Jun 02, 2021 at 08:50:54PM -0600, Alex Williamson wrote:
> > On Wed, 2 Jun 2021 19:45:36 -0300
> > Jason Gunthorpe  wrote:
> >   
> > > On Wed, Jun 02, 2021 at 02:37:34PM -0600, Alex Williamson wrote:
> > >   
> > > > Right.  I don't follow where you're jumping to relaying DMA_PTE_SNP
> > > > from the guest page table... what page table?  
> > > 
> > > I see my confusion now, the phrasing in your earlier remark led me
> > > think this was about allowing the no-snoop performance enhancement in
> > > some restricted way.
> > > 
> > > It is really about blocking no-snoop 100% of the time and then
> > > disabling the dangerous wbinvd when the block is successful.
> > > 
> > > Didn't closely read the kvm code :\
> > > 
> > > If it was about allowing the optimization then I'd expect the guest to
> > > enable no-snoopable regions via it's vIOMMU and realize them to the
> > > hypervisor and plumb the whole thing through. Hence my remark about
> > > the guest page tables..
> > > 
> > > So really the test is just 'were we able to block it' ?  
> > 
> > Yup.  Do we really still consider that there's some performance benefit
> > to be had by enabling a device to use no-snoop?  This seems largely a
> > legacy thing.  
> 
> I've had some no-snoopy discussions recently.. The issue didn't
> vanish, it is still expensive going through all that cache hardware.
> 
> > > But Ok, back to /dev/ioasid. This answers a few lingering questions I
> > > had..
> > > 
> > > 1) Mixing IOMMU_CAP_CACHE_COHERENCY and !IOMMU_CAP_CACHE_COHERENCY
> > >domains.
> > > 
> > >This doesn't actually matter. If you mix them together then kvm
> > >will turn on wbinvd anyhow, so we don't need to use the DMA_PTE_SNP
> > >anywhere in this VM.
> > > 
> > >Thus if two IOMMUs are joined together into a single /dev/ioasid
> > >then we can just make them both pretend to be
> > >!IOMMU_CAP_CACHE_COHERENCY and both not set IOMMU_CACHE.  
> > 
> > Yes and no.  Yes, if any domain is !IOMMU_CAP_CACHE_COHERENCY then we
> > need to emulate wbinvd, but no we'll use IOMMU_CACHE any time it's
> > available based on the per domain support available.  That gives us the
> > most consistent behavior, ie. we don't have VMs emulating wbinvd
> > because they used to have a device attached where the domain required
> > it and we can't atomically remap with new flags to perform the same as
> > a VM that never had that device attached in the first place.  
> 
> I think we are saying the same thing..

Hrm?  I think I'm saying the opposite of your "both not set
IOMMU_CACHE".  IOMMU_CACHE is the mapping flag that enables
DMA_PTE_SNP.  Maybe you're using IOMMU_CACHE as the state reported to
KVM?

> > > 2) How to fit this part of kvm in some new /dev/ioasid world
> > > 
> > >What we want to do here is iterate over every ioasid associated
> > >with the group fd that is passed into kvm.  
> > 
> > Yeah, we need some better names, binding a device to an ioasid (fd) but
> > then attaching a device to an allocated ioasid (non-fd)... I assume
> > you're talking about the latter ioasid.  
> 
> Fingers crossed on RFCv2.. Here I mean the IOASID object inside the
> /dev/iommu FD. The vfio_device would have some kref handle to the
> in-kernel representation of it. So we can interact with it..
> 
> > >Or perhaps more directly: an op attaching the vfio_device to the
> > >kvm and having some simple helper 
> > >  '(un)register ioasid with kvm (kvm, ioasid)'
> > >that the vfio_device driver can call that just sorts this out.  
> >
> > We could almost eliminate the device notion altogether here, use an
> > ioasidfd_for_each_ioasid() but we really want a way to trigger on each
> > change to the composition of the device set for the ioasid, which is
> > why we currently do it on addition or removal of a group, where the
> > group has a consistent set of IOMMU properties.  
> 
> That is another quite good option, just forget about trying to be
> highly specific and feed in the /dev/ioasid FD and have kvm ask "does
> anything in here not enforce snoop?"
> 
> With something appropriate to track/block changing that answer.
> 
> It doesn't so

Re: [RFC] /dev/ioasid uAPI proposal

2021-06-02 Thread Alex Williamson
On Thu, 3 Jun 2021 03:22:27 +
"Tian, Kevin"  wrote:

> > From: Alex Williamson 
> > Sent: Thursday, June 3, 2021 10:51 AM
> > 
> > On Wed, 2 Jun 2021 19:45:36 -0300
> > Jason Gunthorpe  wrote:
> >   
> > > On Wed, Jun 02, 2021 at 02:37:34PM -0600, Alex Williamson wrote:
> > >  
> > > > Right.  I don't follow where you're jumping to relaying DMA_PTE_SNP
> > > > from the guest page table... what page table?  
> > >
> > > I see my confusion now, the phrasing in your earlier remark led me
> > > think this was about allowing the no-snoop performance enhancement in
> > > some restricted way.
> > >
> > > It is really about blocking no-snoop 100% of the time and then
> > > disabling the dangerous wbinvd when the block is successful.
> > >
> > > Didn't closely read the kvm code :\
> > >
> > > If it was about allowing the optimization then I'd expect the guest to
> > > enable no-snoopable regions via its vIOMMU and realize them to the
> > > hypervisor and plumb the whole thing through. Hence my remark about
> > > the guest page tables..
> > >
> > > So really the test is just 'were we able to block it' ?  
> > 
> > Yup.  Do we really still consider that there's some performance benefit
> > to be had by enabling a device to use no-snoop?  This seems largely a
> > legacy thing.  
> 
> Yes, there is indeed performance benefit for device to use no-snoop,
> e.g. 8K display and some imaging processing path, etc. The problem is
> that the IOMMU for such devices is typically a different one from the
> default IOMMU for most devices. This special IOMMU may not have
> the ability of enforcing snoop on no-snoop PCI traffic then this fact
> must be understood by KVM to do proper mtrr/pat/wbinvd virtualization 
> for such devices to work correctly.

The case where the IOMMU does not support snoop-control for such a
device already works fine, we can't prevent no-snoop so KVM will
emulate wbinvd.  The harder one is if we should opt to allow no-snoop
even if the IOMMU does support snoop-control.
 
> > > > This support existed before mdev, IIRC we needed it for direct
> > > > assignment of NVIDIA GPUs.  
> > >
> > > Probably because they ignored the disable no-snoop bits in the control
> > > block, or reset them in some insane way to "fix" broken BIOSes and
> > > kept using it even though by all rights qemu would have tried hard to
> > > turn it off via the config space. Processing no-snoop without a
> > > working wbinvd would be fatal. Yeesh
> > >
> > > But Ok, back to /dev/ioasid. This answers a few lingering questions I
> > > had..
> > >
> > > 1) Mixing IOMMU_CAP_CACHE_COHERENCY and !IOMMU_CAP_CACHE_COHERENCY
> > >domains.
> > >
> > >This doesn't actually matter. If you mix them together then kvm
> > >will turn on wbinvd anyhow, so we don't need to use the DMA_PTE_SNP
> > >anywhere in this VM.
> > >
> > >Thus if two IOMMUs are joined together into a single /dev/ioasid
> > >then we can just make them both pretend to be
> > >!IOMMU_CAP_CACHE_COHERENCY and both not set IOMMU_CACHE.  
> > 
> > Yes and no.  Yes, if any domain is !IOMMU_CAP_CACHE_COHERENCY then we
> > need to emulate wbinvd, but no we'll use IOMMU_CACHE any time it's
> > available based on the per domain support available.  That gives us the
> > most consistent behavior, ie. we don't have VMs emulating wbinvd
> > because they used to have a device attached where the domain required
> > it and we can't atomically remap with new flags to perform the same as
> > a VM that never had that device attached in the first place.
> >   
> > > 2) How to fit this part of kvm in some new /dev/ioasid world
> > >
> > >What we want to do here is iterate over every ioasid associated
> > >with the group fd that is passed into kvm.  
> > 
> > Yeah, we need some better names, binding a device to an ioasid (fd) but
> > then attaching a device to an allocated ioasid (non-fd)... I assume
> > you're talking about the latter ioasid.
> >   
> > >Today the group fd has a single container which specifies the
> > >single ioasid so this is being done trivially.
> > >
> > >To reorg we want to get the ioasid from the device not the
> > >group (see my note to David about the groups vs dev

Re: [RFC] /dev/ioasid uAPI proposal

2021-06-02 Thread Alex Williamson
On Wed, 2 Jun 2021 19:45:36 -0300
Jason Gunthorpe  wrote:

> On Wed, Jun 02, 2021 at 02:37:34PM -0600, Alex Williamson wrote:
> 
> > Right.  I don't follow where you're jumping to relaying DMA_PTE_SNP
> > from the guest page table... what page table?
> 
> I see my confusion now, the phrasing in your earlier remark led me
> think this was about allowing the no-snoop performance enhancement in
> some restricted way.
> 
> It is really about blocking no-snoop 100% of the time and then
> disabling the dangerous wbinvd when the block is successful.
> 
> Didn't closely read the kvm code :\
> 
> If it was about allowing the optimization then I'd expect the guest to
> enable no-snoopable regions via its vIOMMU and realize them to the
> hypervisor and plumb the whole thing through. Hence my remark about
> the guest page tables..
> 
> So really the test is just 'were we able to block it' ?

Yup.  Do we really still consider that there's some performance benefit
to be had by enabling a device to use no-snoop?  This seems largely a
legacy thing.

> > This support existed before mdev, IIRC we needed it for direct
> > assignment of NVIDIA GPUs.  
> 
> Probably because they ignored the disable no-snoop bits in the control
> block, or reset them in some insane way to "fix" broken BIOSes and
> kept using it even though by all rights qemu would have tried hard to
> turn it off via the config space. Processing no-snoop without a
> working wbinvd would be fatal. Yeesh
> 
> But Ok, back to /dev/ioasid. This answers a few lingering questions I
> had..
> 
> 1) Mixing IOMMU_CAP_CACHE_COHERENCY and !IOMMU_CAP_CACHE_COHERENCY
>domains.
> 
>This doesn't actually matter. If you mix them together then kvm
>will turn on wbinvd anyhow, so we don't need to use the DMA_PTE_SNP
>anywhere in this VM.
> 
>Thus if two IOMMUs are joined together into a single /dev/ioasid
>then we can just make them both pretend to be
>!IOMMU_CAP_CACHE_COHERENCY and both not set IOMMU_CACHE.

Yes and no.  Yes, if any domain is !IOMMU_CAP_CACHE_COHERENCY then we
need to emulate wbinvd, but no we'll use IOMMU_CACHE any time it's
available based on the per domain support available.  That gives us the
most consistent behavior, ie. we don't have VMs emulating wbinvd
because they used to have a device attached where the domain required
it and we can't atomically remap with new flags to perform the same as
a VM that never had that device attached in the first place.

> 2) How to fit this part of kvm in some new /dev/ioasid world
> 
>What we want to do here is iterate over every ioasid associated
>with the group fd that is passed into kvm.

Yeah, we need some better names, binding a device to an ioasid (fd) but
then attaching a device to an allocated ioasid (non-fd)... I assume
you're talking about the latter ioasid.

>Today the group fd has a single container which specifies the
>single ioasid so this is being done trivially.
> 
>To reorg we want to get the ioasid from the device not the
>group (see my note to David about the groups vs device rational)
> 
>This is just iterating over each vfio_device in the group and
>querying the ioasid it is using.

The IOMMU API group interface is largely iommu_group_for_each_dev()
anyway; we still need to account for all the RIDs and aliases of a
group.
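
For reference, that walk is just the below (the callback name is made
up); iommu_group_for_each_dev() invokes the callback for each member
of the group under the group's own mutex:

  static int vfio_dev_check(struct device *dev, void *data)
  {
          /* inspect one group member */
          return 0;  /* non-zero stops the iteration */
  }

  ret = iommu_group_for_each_dev(group, &info, vfio_dev_check);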

>Or perhaps more directly: an op attaching the vfio_device to the
>kvm and having some simple helper 
>  '(un)register ioasid with kvm (kvm, ioasid)'
>that the vfio_device driver can call that just sorts this out.

We could almost eliminate the device notion altogether here, use an
ioasidfd_for_each_ioasid() but we really want a way to trigger on each
change to the composition of the device set for the ioasid, which is
why we currently do it on addition or removal of a group, where the
group has a consistent set of IOMMU properties.  Register a notifier
callback via the ioasidfd?  Thanks,

Alex



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-02 Thread Alex Williamson
On Wed, 2 Jun 2021 16:54:04 -0300
Jason Gunthorpe  wrote:

> On Wed, Jun 02, 2021 at 01:00:53PM -0600, Alex Williamson wrote:
> > 
> > Right, the device can generate the no-snoop transactions, but it's the
> > IOMMU that essentially determines whether those transactions are
> > actually still cache coherent, AIUI.  
> 
> Wow, this is really confusing stuff in the code.
> 
> At the PCI level there is a TLP bit called no-snoop that is platform
> specific. The general intention is to allow devices to selectively
> bypass the CPU caching for DMAs. GPUs like to use this feature for
> performance.

Yes

> I assume there are some exciting security issues here. Looks like
> allowing cache bypass does something bad inside VMs? Looks like
> allowing the VM to use the cache clear instruction that is mandatory
> with cache bypass DMA causes some QOS issues? OK.

IIRC, largely a DoS issue if userspace gets to choose when to emulate
wbinvd rather than it being demanded for correct operation.

> So how does it work?
> 
> What I see in the intel/iommu.c is that some domains support "snoop
> control" or not, based on some HW flag. This indicates if the
> DMA_PTE_SNP bit is supported on a page by page basis or not.
> 
> Since x86 always leans toward "DMA cache coherent" I'm reading some
> tea leaves here:
> 
>   IOMMU_CAP_CACHE_COHERENCY,  /* IOMMU can enforce cache coherent DMA
>  transactions */
> 
> And guessing that IOMMUs that implement DMA_PTE_SNP will ignore the
> snoop bit in TLPs for IOVA's that have DMA_PTE_SNP set?

That's my understanding as well.
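
For reference, that's essentially what the intel-iommu mapping path
does (paraphrased from drivers/iommu/intel/iommu.c of this era):

  if (iommu_prot & IOMMU_READ)
          prot |= DMA_PTE_READ;
  if (iommu_prot & IOMMU_WRITE)
          prot |= DMA_PTE_WRITE;
  if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
          prot |= DMA_PTE_SNP;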

> Further, I guess IOMMUs that don't support PTE_SNP, or have
> DMA_PTE_SNP clear will always honour the snoop bit. (backwards compat
> and all)

Yes.

> So, IOMMU_CAP_CACHE_COHERENCY does not mean the IOMMU is DMA
> incoherent with the CPU caches, it just means that that snoop bit in
> the TLP cannot be enforced. ie the device *could* do no-snoop DMA
> if it wants. Devices that never do no-snoop remain DMA coherent on
> x86, as they always have been.

Yes, IOMMU_CAP_CACHE_COHERENCY=false means we cannot force the device
DMA to be coherent via the IOMMU.

> IOMMU_CACHE does not mean the IOMMU is DMA cache coherent, it means
> the PCI device is blocked from using no-snoop in its TLPs.
> 
> I wonder if ARM implemented this consistently? I see VDPA is
> confused.. I was confused. What a terrible set of names.
> 
> In VFIO generic code I see it always sets IOMMU_CACHE:
> 
> if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
> domain->prot |= IOMMU_CACHE;
> 
> And thus also always provides IOMMU_CACHE to iommu_map:
> 
> ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
>                 npage << PAGE_SHIFT, prot | d->prot);
> 
> So when the IOMMU supports the no-snoop blocking security feature VFIO
> turns it on and blocks no-snoop to all pages? Ok..

Yep, I'd forgotten this nuance that we need to enable it via the
mapping flags.

> But I must be missing something big because *something* in the IOVA
> map should work with no-snoopable DMA, right? Otherwise what is the
> point of exposing the invalidate instruction to the guest?
> 
> I would think userspace should be relaying the DMA_PTE_SNP bit from
> the guest's page tables up to here??
> 
> The KVM hookup is driven by IOMMU_CACHE which is driven by
> IOMMU_CAP_CACHE_COHERENCY. So we turn on the special KVM support only
> if the IOMMU can block the SNP bit? And then we map all the pages to
> block the snoop bit? Huh?

Right.  I don't follow where you're jumping to relaying DMA_PTE_SNP
from the guest page table... what page table?  We don't necessarily
have a vIOMMU to expose such things, I don't think it even existed when
this was added.  Essentially if we can ignore no-snoop at the IOMMU,
then KVM doesn't need to worry about emulating wbinvd because of an
assigned device, whether that device uses it or not.  Win-win.

> Your explanation makes perfect sense: Block guests from using the
> dangerous cache invalidate instruction unless a device that uses
> no-snoop is plugged in. Block devices from using no-snoop because
> something about it is insecure. Ok.

No-snoop itself is not insecure, but to support no-snoop in a VM KVM
can't ignore wbinvd, which has overhead and abuse implications.

> But the conditions I'm looking for "device that uses no-snoop" is:
>  - The device will issue no-snoop TLPs at all

We can't really know this generically.  We can try to set the enable
bit to see if the device is capable of no-snoop, but that doesn't tell
us whether the driver will actually cause it to issue them.

Re: [RFC] /dev/ioasid uAPI proposal

2021-06-02 Thread Alex Williamson
On Wed, 2 Jun 2021 15:09:25 -0300
Jason Gunthorpe  wrote:

> On Wed, Jun 02, 2021 at 12:01:11PM -0600, Alex Williamson wrote:
> > On Wed, 2 Jun 2021 14:35:10 -0300
> > Jason Gunthorpe  wrote:
> >   
> > > On Wed, Jun 02, 2021 at 11:11:17AM -0600, Alex Williamson wrote:
> > >   
> > > > > > > present and be able to test if DMA for that device is cache
> > > > > > > coherent.  
> > > > > 
> > > > > Why is this such a strong linkage to VFIO and not just a 'hey kvm
> > > > > emulate wbinvd' flag from qemu?
> > > > 
> > > > IIRC, wbinvd has host implications, a malicious user could tell KVM to
> > > > emulate wbinvd then run the op in a loop and induce a disproportionate
> > > > load on the system.  We therefore wanted a way that it would only be
> > > > enabled when required.
> > > 
> > > I think the non-coherentness is vfio_device specific? eg a specific
> > > device will decide if it is coherent or not?  
> > 
> > No, this is specifically whether DMA is cache coherent to the
> > processor, ie. in the case of wbinvd whether the processor needs to
> > invalidate its cache in order to see data from DMA.  
> 
> I'm confused. This is x86, all DMA is cache coherent unless the device
> is doing something special.
> 
> > > If yes I'd recast this to call kvm_arch_register_noncoherent_dma()
> > > from the VFIO_GROUP_NOTIFY_SET_KVM in the struct vfio_device
> > > implementation and not link it through the IOMMU.  
> > 
> > The IOMMU tells us if DMA is cache coherent, VFIO_DMA_CC_IOMMU maps to
> > IOMMU_CAP_CACHE_COHERENCY for all domains within a container.  
> 
> And this special IOMMU mode is basically requested by the device
> driver, right? Because if you use this mode you have to also use
> special programming techniques.
> 
> This smells like all the "snoop bypass" stuff from PCIE (for GPUs
> even) in a different guise - it is device triggered, not platform
> triggered behavior.

Right, the device can generate the no-snoop transactions, but it's the
IOMMU that essentially determines whether those transactions are
actually still cache coherent, AIUI.

I did experiment with virtually hardwiring the Enable No-Snoop bit in
the Device Control Register to zero, which would be generically allowed
by the PCIe spec, but then we get into subtle dependencies in the device
drivers and clearing the bit again after any sort of reset and the
backdoor accesses to config space which exist mostly in the class of
devices that might use no-snoop transactions (yes, GPUs suck).
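
Probing whether the bit is hardwired to zero, as suggested elsewhere
in the thread, would be simple enough - a sketch, not actual vfio-pci
code:

  u16 orig, ctl;

  pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &orig);
  pcie_capability_set_word(pdev, PCI_EXP_DEVCTL,
                           PCI_EXP_DEVCTL_NOSNOOP_EN);
  pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &ctl);
  pcie_capability_write_word(pdev, PCI_EXP_DEVCTL, orig);  /* restore */

  /* bit stayed clear => device can never issue no-snoop TLPs */
  coherent_only = !(ctl & PCI_EXP_DEVCTL_NOSNOOP_EN);

Though knowing the device *could* issue no-snoop still tells us
nothing about whether its driver will.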

It was much easier and more robust to ignore the device setting and rely
on the IOMMU behavior.  Yes, maybe we sometimes emulate wbinvd for VMs
where the device doesn't support no-snoop, but it seemed like platforms
were headed in this direction where no-snoop was ignored anyway.
Thanks,

Alex



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-02 Thread Alex Williamson
On Wed, 2 Jun 2021 14:35:10 -0300
Jason Gunthorpe  wrote:

> On Wed, Jun 02, 2021 at 11:11:17AM -0600, Alex Williamson wrote:
> 
> > > > > present and be able to test if DMA for that device is cache
> > > > > coherent.
> > > 
> > > Why is this such a strong linkage to VFIO and not just a 'hey kvm
> > > emulate wbinvd' flag from qemu?  
> > 
> > IIRC, wbinvd has host implications, a malicious user could tell KVM to
> > emulate wbinvd then run the op in a loop and induce a disproportionate
> > load on the system.  We therefore wanted a way that it would only be
> > enabled when required.  
> 
> I think the non-coherentness is vfio_device specific? eg a specific
> device will decide if it is coherent or not?

No, this is specifically whether DMA is cache coherent to the
processor, ie. in the case of wbinvd whether the processor needs to
invalidate its cache in order to see data from DMA.

> If yes I'd recast this to call kvm_arch_register_noncoherent_dma()
> from the VFIO_GROUP_NOTIFY_SET_KVM in the struct vfio_device
> implementation and not link it through the IOMMU.

The IOMMU tells us if DMA is cache coherent, VFIO_DMA_CC_IOMMU maps to
IOMMU_CAP_CACHE_COHERENCY for all domains within a container.
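
Userspace can already query the aggregate result; this is existing
uAPI, not something new:

  /* >0 means every domain in the container enforces coherency, ie.
   * the IOMMU blocks no-snoop and wbinvd emulation is unnecessary */
  int cc = ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_DMA_CC_IOMMU);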

> If userspace is telling the vfio_device to be non-coherent or not then
> it can call kvm_arch_register_noncoherent_dma() or not based on that
> signal.

Not non-coherent device memory, that would be a driver issue; cache
coherence of DMA is what we're after.

> > > It kind of looks like the other main point is to generate the
> > > VFIO_GROUP_NOTIFY_SET_KVM which is being used by two VFIO drivers to
> > > connect back to the kvm data
> > > 
> > > But that seems like it would have been better handled with some IOCTL
> > > on the vfio_device fd to import the KVM to the driver not this
> > > roundabout way?  
> > 
> > Then QEMU would need to know which drivers require KVM knowledge?  This
> > allowed transparent backwards compatibility with userspace.  Thanks,  
> 
> I'd just blindly fire a generic 'hey here is your KVM FD' into every
> VFIO device.

Yes, QEMU could do this, but the kvm-vfio device was already there with
this association and required no uAPI work.  This one is the least IOMMU
related of the use cases.  Thanks,

Alex



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-02 Thread Alex Williamson
On Wed, 2 Jun 2021 13:01:40 -0300
Jason Gunthorpe  wrote:

> On Wed, Jun 02, 2021 at 02:20:15AM +, Tian, Kevin wrote:
> > > From: Alex Williamson 
> > > Sent: Wednesday, June 2, 2021 6:22 AM
> > > 
> > > On Tue, 1 Jun 2021 07:01:57 +
> > > "Tian, Kevin"  wrote:  
> > > >
> > > > I summarized five opens here, about:
> > > >
> > > > 1)  Finalizing the name to replace /dev/ioasid;
> > > > 2)  Whether one device is allowed to bind to multiple IOASID fd's;
> > > > 3)  Carry device information in invalidation/fault reporting uAPI;
> > > > 4)  What should/could be specified when allocating an IOASID;
> > > > 5)  The protocol between vfio group and kvm;
> > > >  
> > > ...  
> > > >
> > > > For 5), I'd expect Alex to chime in. Per my understanding looks the
> > > > original purpose of this protocol is not about I/O address space. It's
> > > > for KVM to know whether any device is assigned to this VM and then
> > > > do something special (e.g. posted interrupt, EPT cache attribute, 
> > > > etc.).  
> > > 
> > > Right, the original use case was for KVM to determine whether it needs
> > > to emulate invlpg, so it needs to be aware when an assigned device is  
> > 
> > invlpg -> wbinvd :)

Oops, of course.
   
> > > present and be able to test if DMA for that device is cache
> > > coherent.  
> 
> Why is this such a strong linkage to VFIO and not just a 'hey kvm
> emulate wbinvd' flag from qemu?

IIRC, wbinvd has host implications, a malicious user could tell KVM to
emulate wbinvd then run the op in a loop and induce a disproportionate
load on the system.  We therefore wanted a way that it would only be
enabled when required.

> I briefly didn't see any obvious linkage in the arch code, just some
> dead code:
> 
> $ git grep iommu_noncoherent
> arch/x86/include/asm/kvm_host.h:  bool iommu_noncoherent;
> $ git grep iommu_domain arch/x86
> arch/x86/include/asm/kvm_host.h:struct iommu_domain *iommu_domain;
> 
> Huh?

Cruft from legacy KVM device assignment, I assume.  What you're looking
for is:

kvm_vfio_update_coherency
 kvm_arch_register_noncoherent_dma
  atomic_inc(&kvm->arch.noncoherent_dma_count);

need_emulate_wbinvd
 kvm_arch_has_noncoherent_dma
  atomic_read(&kvm->arch.noncoherent_dma_count);

There are a couple other callers that I'm not as familiar with.
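
The gate itself is tiny (from arch/x86/kvm, lightly paraphrased):

  bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
  {
          return atomic_read(&kvm->arch.noncoherent_dma_count);
  }

  static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
  {
          return kvm_arch_has_noncoherent_dma(vcpu->kvm);
  }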

> It kind of looks like the other main point is to generate the
> VFIO_GROUP_NOTIFY_SET_KVM which is being used by two VFIO drivers to
> connect back to the kvm data
> 
> But that seems like it would have been better handled with some IOCTL
> on the vfio_device fd to import the KVM to the driver not this
> roundabout way?

Then QEMU would need to know which drivers require KVM knowledge?  This
allowed transparent backwards compatibility with userspace.  Thanks,

Alex



Re: [RFC] /dev/ioasid uAPI proposal

2021-06-01 Thread Alex Williamson
On Tue, 1 Jun 2021 07:01:57 +
"Tian, Kevin"  wrote:
> 
> I summarized five opens here, about:
> 
> 1)  Finalizing the name to replace /dev/ioasid;
> 2)  Whether one device is allowed to bind to multiple IOASID fd's;
> 3)  Carry device information in invalidation/fault reporting uAPI;
> 4)  What should/could be specified when allocating an IOASID;
> 5)  The protocol between vfio group and kvm;
> 
...
> 
> For 5), I'd expect Alex to chime in. Per my understanding looks the
> original purpose of this protocol is not about I/O address space. It's
> for KVM to know whether any device is assigned to this VM and then
> do something special (e.g. posted interrupt, EPT cache attribute, etc.).

Right, the original use case was for KVM to determine whether it needs
to emulate invlpg, so it needs to be aware when an assigned device is
present and be able to test if DMA for that device is cache coherent.
The user, QEMU, creates a KVM "pseudo" device representing the vfio
group, providing the file descriptor of that group to show ownership.
The ugly symbol_get code is to avoid hard module dependencies, ie. the
kvm module should not pull in or require the vfio module, but vfio will
be present if attempting to register this device.

With kvmgt, the interface also became a way to register the kvm pointer
with vfio for the translation mentioned elsewhere in this thread.

The PPC/SPAPR support allows KVM to associate a vfio group to an IOMMU
page table so that it can handle iotlb programming from pre-registered
memory without trapping out to userspace.

> Because KVM deduces some policy based on the fact of an assigned
> device, it needs to hold a reference to the related vfio group. This
> part is irrelevant to this RFC.

All of these use cases are related to the IOMMU, whether DMA is
coherent, translating device IOVA to GPA, and an acceleration path to
emulate IOMMU programming in kernel... they seem pretty relevant.

> But ARM's VMID usage is related to I/O address space and thus needs some
> consideration. Another strange thing is about PPC. It looks like it also
> leverages this protocol to do iommu group attach:
> kvm_spapr_tce_attach_iommu_group. I don't know why it's done through KVM
> instead of VFIO uAPI in the first place.

AIUI, IOMMU programming on PPC is done through hypercalls, so KVM needs
to know how to handle those for in-kernel acceleration.  Thanks,

Alex



Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-05-26 Thread Alex Williamson
On Wed, 26 May 2021 23:40:02 +0530
Kirti Wankhede  wrote:

> On 5/26/2021 4:22 AM, Alex Williamson wrote:
> > On Wed, 26 May 2021 00:56:30 +0530
> > Kirti Wankhede  wrote:
> >   
> >> On 5/25/2021 5:07 AM, Jason Gunthorpe wrote:  
> >>> On Mon, May 24, 2021 at 05:52:58PM +1000, David Gibson wrote:
> >>>  
> >>>>>> I don't really see a semantic distinction between "always one-device
> >>>>>> groups" and "groups don't matter".  Really the only way you can afford
> >>>>>> to not care about groups is if they're singletons.  
> >>>>>
> >>>>> The kernel driver under the mdev may not be in an "always one-device"
> >>>>> group.  
> >>>>
> >>>> I don't really understand what you mean by that.  
> >>>
> >>> I mean the group of the mdev's actual DMA device may have multiple
> >>> things in it.
> >>>
> >>>>> It is a kernel driver so the only thing we know and care about is that
> >>>>> all devices in the HW group are bound to kernel drivers.
> >>>>>
> >>>>> The vfio device that spawns from this kernel driver is really a
> >>>>> "groups don't matter" vfio device because at the IOMMU layer it should
> >>>>> be riding on the physical group of the kernel driver.  At the VFIO
> >>>>> layer we no longer care about the group abstraction because the system
> >>>>> guarantees isolation in some other way.  
> >>>>
> >>>> Uh.. I don't really know how mdevs are isolated from each other.  I
> >>>> thought it was because the physical device providing the mdevs
> >>>> effectively had an internal IOMMU (or at least DMA permissioning) to
> >>>> isolate the mdevs, even though the physical device may not be fully
> >>>> isolated.
> >>>>
> >>>> In that case the virtual mdev is effectively in a singleton group,
> >>>> which is different from the group of its parent device.  
> >>>  
> >>
> >> That's correct.
> >>  
> >>> That is one way to view it, but it means creating a whole group
> >>> infrastructure and abusing the IOMMU stack just to create this
> >>> nonsense fiction.  
> >>
> >> I really didn't get how this abuses the IOMMU stack.
> >> mdev can be used in 3 different ways:
> >> 1. non-IOMMU-backed mdev devices, where the mdev vendor driver takes
> >> care of DMA mapping (iommu_map) and isolation is through the device
> >> hardware's internal MMU. Here the vfio_iommu_type1 module provides a
> >> way to validate and pin pages required by the mdev device for DMA
> >> mapping. The IOMMU mapping is then done by the mdev vendor driver,
> >> which is the owner driver of the physical device.
> >>
> >> 2. IOMMU-backed mdev devices for SR-IOV, where an mdev device is
> >> created per VF (mdev device == VF device); that mdev device then has
> >> the same IOMMU protection scope as the VF associated with it. Here the
> >> mdev device is a virtual device which uses the features of mdev and
> >> represents the underlying VF device, the same as vfio-pci but with
> >> additional mdev features.  
> > 
> > What features would those be?  There are no mdev specific parts of the
> > vfio uAPI.
> > 
> > The mdev device is a virtual device, but why is it virtual in this case?
> > Aren't we effectively assigning the VF itself (mdev device == VF device)
> > with a bunch of extra support code to fill in the gaps of the VF
> > implementing the complete device model in hardware?
> > 
> > We're effectively creating this virtual device, creating a fake IOMMU
> > group, and trying to create this association of this virtual device to
> > the real VF in order to shoehorn it into the mdev model.  What do we
> > get from that model other than lifecycle management (ie. type selection)
> > and re-use of a bunch of code from the driver supporting the 1) model
> > above?
> >   
> 
> Yes, the lifecycle management which is in mdev is not in the vfio-pci variant.
> 
> > This specific model seems better served by a device specific peer
> > driver to vfio-pci (ie. a "vfio-pci variant").  You effectively already
> > have the code for this driver, it's just in the format of an mdev
> > driver rather than a vfio "bus driver".  The work Jason references
> > relative to Max aims to make these kinds of drivers easier to implement
> > through re-use of vfio-pci code.
> > 
> > There are certainly other solutions we could come up with for selecting
> > a specific device type for a vfio-pci variant driver to implement other
> > than pretending this model actually belongs in mdev, right?  Thanks,
> >   
> 
> Sure, and I would like to see the type selection mechanism implemented
> in the vfio-pci variant.


A driver-provided sysfs attribute would obviously fill the short-term
gap; long term, maybe this would be standardized via netlink.  It
seems a bit analogous to setting the MAC address for a VF on an SR-IOV
NIC or VF namespace configuration for an SR-IOV NVMe device.  A rough
sketch of the sysfs shape follows below.  Thanks,

Alex
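
A hedged sketch of such an attribute in a hypothetical vfio-pci variant
driver; the attribute name and the my_variant_set_type() helper are
invented for illustration:

static ssize_t device_type_store(struct device *dev,
                                 struct device_attribute *attr,
                                 const char *buf, size_t count)
{
        struct pci_dev *pdev = to_pci_dev(dev);
        int ret;

        /* parse the requested type and reconfigure the device model */
        ret = my_variant_set_type(pdev, buf); /* hypothetical helper */
        return ret ? ret : count;
}
static DEVICE_ATTR_WO(device_type);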



Re: [PATCH V4 05/18] iommu/ioasid: Redefine IOASID set and allocation APIs

2021-05-25 Thread Alex Williamson
On Wed, 26 May 2021 00:56:30 +0530
Kirti Wankhede  wrote:

> On 5/25/2021 5:07 AM, Jason Gunthorpe wrote:
> > On Mon, May 24, 2021 at 05:52:58PM +1000, David Gibson wrote:
> >   
>  I don't really see a semantic distinction between "always one-device
>  groups" and "groups don't matter".  Really the only way you can afford
>  to not care about groups is if they're singletons.  
> >>>
> >>> The kernel driver under the mdev may not be in an "always one-device"
> >>> group.  
> >>
> >> I don't really understand what you mean by that.  
> > 
> > I mean the group of the mdev's actual DMA device may have multiple
> > things in it.
> > 
> >>> It is a kernel driver so the only thing we know and care about is that
> >>> all devices in the HW group are bound to kernel drivers.
> >>>
> >>> The vfio device that spawns from this kernel driver is really a
> >>> "groups don't matter" vfio device because at the IOMMU layer it should
> >>> be riding on the physical group of the kernel driver.  At the VFIO
> >>> layer we no longer care about the group abstraction because the system
> >>> guarantees isolation in some other way.  
> >>
> >> Uh.. I don't really know how mdevs are isolated from each other.  I
> >> thought it was because the physical device providing the mdevs
> >> effectively had an internal IOMMU (or at least DMA permissioning) to
> >> isolate the mdevs, even though the physical device may not be fully
> >> isolated.
> >>
> >> In that case the virtual mdev is effectively in a singleton group,
> >> which is different from the group of its parent device.  
> >   
> 
> That's correct.
> 
> > That is one way to view it, but it means creating a whole group
> > infrastructure and abusing the IOMMU stack just to create this
> > nonsense fiction.  
> 
> I really didn't get how this abuses the IOMMU stack.
> mdev can be used in 3 different ways:
> 1. non-IOMMU-backed mdev devices, where the mdev vendor driver takes
> care of DMA mapping (iommu_map) and isolation is through the device
> hardware's internal MMU. Here the vfio_iommu_type1 module provides a
> way to validate and pin pages required by the mdev device for DMA
> mapping. The IOMMU mapping is then done by the mdev vendor driver,
> which is the owner driver of the physical device.
> 
> 2. IOMMU-backed mdev devices for SR-IOV, where an mdev device is
> created per VF (mdev device == VF device); that mdev device then has
> the same IOMMU protection scope as the VF associated with it. Here the
> mdev device is a virtual device which uses the features of mdev and
> represents the underlying VF device, the same as vfio-pci but with
> additional mdev features.

What features would those be?  There are no mdev specific parts of the
vfio uAPI.

The mdev device is a virtual device, but why is it virtual in this case?
Aren't we effectively assigning the VF itself (mdev device == VF device)
with a bunch of extra support code to fill in the gaps of the VF
implementing the complete device model in hardware?

We're effectively creating this virtual device, creating a fake IOMMU
group, and trying to create this association of this virtual device to
the real VF in order to shoehorn it into the mdev model.  What do we
get from that model other than lifecycle management (ie. type selection)
and re-use of a bunch of code from the driver supporting the 1) model
above?

This specific model seems better served by a device specific peer
driver to vfio-pci (ie. a "vfio-pci variant").  You effectively already
have the code for this driver, it's just in the format of an mdev
driver rather than a vfio "bus driver".  The work Jason references
relative to Max aims to make these kinds of drivers easier to implement
through re-use of vfio-pci code.

There are certainly other solutions we could come up with for selecting
a specific device type for a vfio-pci variant driver to implement other
than pretending this model actually belongs in mdev, right?  Thanks,

Alex



Re: [RFC PATCH v3 5/8] vfio/type1: VFIO_IOMMU_ENABLE_IOPF

2021-05-24 Thread Alex Williamson
On Fri, 21 May 2021 14:38:25 +0800
Shenming Lu  wrote:

> On 2021/5/19 2:58, Alex Williamson wrote:
> > On Fri, 9 Apr 2021 11:44:17 +0800
> > Shenming Lu  wrote:
> >   
> >> Since enabling IOPF for devices may lead to a slow ramp up of performance,
> >> we add an ioctl VFIO_IOMMU_ENABLE_IOPF to make it configurable. And the
> >> IOPF enabling of a VFIO device includes setting IOMMU_DEV_FEAT_IOPF and
> >> registering the VFIO IOPF handler.
> >>
> >> Note that VFIO_IOMMU_DISABLE_IOPF is not supported since there may be
> >> inflight page faults when disabling.
> >>
> >> Signed-off-by: Shenming Lu 
> >> ---
> >>  drivers/vfio/vfio_iommu_type1.c | 223 +++-
> >>  include/uapi/linux/vfio.h   |   6 +
> >>  2 files changed, 226 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c 
> >> b/drivers/vfio/vfio_iommu_type1.c
> >> index 01e296c6dc9e..7df5711e743a 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -71,6 +71,7 @@ struct vfio_iommu {
> >>struct rb_root  dma_list;
> >>struct blocking_notifier_head notifier;
> >>struct mmu_notifier mn;
> >> +  struct mm_struct*mm;  
> > 
> > We currently have no requirement that a single mm is used for all
> > container mappings.  Does enabling IOPF impose such a requirement?
> > Shouldn't MAP/UNMAP enforce that?  
> 
> Did you mean that there is a possibility that each vfio_dma in a
> container has a different mm? If so, could we register one MMU
> notifier for each vfio_dma in the MAP ioctl?

We don't prevent it currently.  There needs to be some balance;
typically we'll have one mm, so would it make sense to have potentially
thousands of mmu notifiers registered against the same mm?
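
A minimal sketch of the single-notifier shape implied here, using the
mn/mm fields this patch adds to struct vfio_iommu (the ops table is
elided; mmu_notifier_register() is the real API from
<linux/mmu_notifier.h>):

static const struct mmu_notifier_ops vfio_iommu_mn_ops = {
        /* .invalidate_range_start = ..., elided */
};

/* one notifier per container mm, rather than one per vfio_dma */
iommu->mm = current->mm;
iommu->mn.ops = &vfio_iommu_mn_ops;
ret = mmu_notifier_register(&iommu->mn, iommu->mm);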
 
> >>unsigned intdma_avail;
> >>unsigned intvaddr_invalid_count;
> >>uint64_tpgsize_bitmap;
> >> @@ -81,6 +82,7 @@ struct vfio_iommu {
> >>booldirty_page_tracking;
> >>boolpinned_page_dirty_scope;
> >>boolcontainer_open;
> >> +  booliopf_enabled;
> >>  };
> >>  
> >>  struct vfio_domain {
> >> @@ -461,6 +463,38 @@ vfio_find_iopf_group(struct iommu_group *iommu_group)
> >>return node ? iopf_group : NULL;
> >>  }
> >>  
> >> +static void vfio_link_iopf_group(struct vfio_iopf_group *new)
> >> +{
> >> +  struct rb_node **link, *parent = NULL;
> >> +  struct vfio_iopf_group *iopf_group;
> >> +
> >> +  mutex_lock(&iopf_group_list_lock);
> >> +
> >> +  link = &iopf_group_list.rb_node;
> >> +
> >> +  while (*link) {
> >> +  parent = *link;
> >> +  iopf_group = rb_entry(parent, struct vfio_iopf_group, node);
> >> +
> >> +  if (new->iommu_group < iopf_group->iommu_group)
> >> +  link = &(*link)->rb_left;
> >> +  else
> >> +  link = &(*link)->rb_right;
> >> +  }
> >> +
> >> +  rb_link_node(&new->node, parent, link);
> >> +  rb_insert_color(&new->node, &iopf_group_list);
> >> +
> >> +  mutex_unlock(&iopf_group_list_lock);
> >> +}
> >> +
> >> +static void vfio_unlink_iopf_group(struct vfio_iopf_group *old)
> >> +{
> >> +  mutex_lock(&iopf_group_list_lock);
> >> +  rb_erase(&old->node, &iopf_group_list);
> >> +  mutex_unlock(&iopf_group_list_lock);
> >> +}
> >> +
> >>  static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
> >>  {
> >>struct mm_struct *mm;
> >> @@ -2363,6 +2397,68 @@ static void vfio_iommu_iova_insert_copy(struct 
> >> vfio_iommu *iommu,
> >>list_splice_tail(iova_copy, iova);
> >>  }
> >>  
> >> +static int vfio_dev_domian_nested(struct device *dev, int *nested)  
> > 
> > 
> > s/domian/domain/
> > 
> >   
> >> +{
> >> +  struct iommu_domain *domain;
> >> +
> >> +  domain = iommu_get_domain_for_dev(dev);
> >> +  if (!domain)
> >> +  return -ENODEV;
> >> +
> >> +  return iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, nested);  
> > 
&

Re: [RFC PATCH v3 0/8] Add IOPF support for VFIO passthrough

2021-05-24 Thread Alex Williamson
On Fri, 21 May 2021 14:37:21 +0800
Shenming Lu  wrote:

> Hi Alex,
> 
> On 2021/5/19 2:57, Alex Williamson wrote:
> > On Fri, 9 Apr 2021 11:44:12 +0800
> > Shenming Lu  wrote:
> >   
> >> Hi,
> >>
> >> Requesting your comments and suggestions. :-)
> >>
> >> The static pinning and mapping problem in VFIO and possible solutions
> >> have been discussed a lot [1, 2]. One of the solutions is to add I/O
> >> Page Fault support for VFIO devices. Different from those relatively
> >> complicated software approaches such as presenting a vIOMMU that provides
> >> the DMA buffer information (might include para-virtualized optimizations),
> >> IOPF mainly depends on the hardware faulting capability, such as the PCIe
> >> PRI extension or Arm SMMU stall model. What's more, the IOPF support in
> >> the IOMMU driver has already been implemented in SVA [3]. So we add IOPF
> >> support for VFIO passthrough based on the IOPF part of SVA in this series. 
> >>  
> > 
> > The SVA proposals are being reworked to make use of a new IOASID
> > object, it's not clear to me that this shouldn't also make use of that
> > work as it does a significant expansion of the type1 IOMMU with fault
> > handlers that would duplicate new work using that new model.  
> 
> It seems that the IOASID extension for guest SVA would not affect this
> series; will we do any host-guest IOASID translation in the VFIO fault
> handler?

Surely it will; we don't currently have any IOMMU fault handling or
forwarding of IOMMU faults through to the vfio bus driver, and both of
those would be included in an IOASID implementation.  I think Jason's
vision is to use IOASID to deprecate type1 for all use cases, so even
if we were to choose to implement IOPF in type1 we should agree on
common interfaces with IOASID.
 
> >> We have measured its performance with UADK [4] (passthrough an accelerator
> >> to a VM(1U16G)) on Hisilicon Kunpeng920 board (and compared with host SVA):
> >>
> >> Run hisi_sec_test...
> >>  - with varying sending times and message lengths
> >>  - with/without IOPF enabled (speed slowdown)
> >>
> >> when msg_len = 1MB (and PREMAP_LEN (in Patch 4) = 1):
> >>                 slowdown (num of faults)
> >>  times  VFIO IOPF      host SVA
> >>  1      63.4% (518)    82.8% (512)
> >>  100    22.9% (1058)   47.9% (1024)
> >>  1000   2.6% (1071)    8.5% (1024)
> >>
> >> when msg_len = 10MB (and PREMAP_LEN = 512):
> >>         slowdown (num of faults)
> >>  times  VFIO IOPF
> >>  1      32.6% (13)
> >>  100    3.5% (26)
> >>  1000   1.6% (26)  
> > 
> > It seems like this is just an example of how you can make a benchmark
> > show anything you want.  The best results would be to pre-map
> > everything, which is what we have without this series.  What is an
> > acceptable overhead to incur to avoid page pinning?  What if userspace
> > had more fine grained control over which mappings were available for
> > faulting and which were statically mapped?  I don't really see what
> > sense the pre-mapping range makes.  If I assume the user is QEMU in a
> > non-vIOMMU configuration, pre-mapping the beginning of each RAM section
> > doesn't make any logical sense relative to device DMA.  
> 
> As you said in Patch 4, we can introduce full end-to-end functionality
> before trying to improve performance, and I will drop the pre-mapping patch
> in the current stage...
> 
> Is there a need for userspace to have more fine-grained control over which
> mappings are available for faulting? If so, we may evolve the MAP ioctl
> to support specifying the faulting range.

You're essentially enabling this for a vfio bus driver via patch 7/8,
pinning for selective DMA faulting.  How would a driver in userspace
make equivalent requests?  In the case of performance, the user could
mlock the page but they have no mechanism here to pre-fault it.  Should
they?
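
For illustration only, evolving the MAP ioctl might look something like
this; VFIO_DMA_MAP_FLAG_IOPF is a hypothetical flag, not part of the
uAPI:

struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
                 VFIO_DMA_MAP_FLAG_IOPF, /* hypothetical: fault, don't pin */
        .vaddr = (__u64)(uintptr_t)buf,
        .iova  = iova,
        .size  = size,
};
ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);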

> As for the overhead of IOPF, it is unavoidable when enabling on-demand
> paging (page faults occur almost only on first access), and it seems that
> there is a large optimization space compared to CPU page faulting.

Yes, there's of course going to be overhead in terms of latency for the
page faults.  My point was more that when a host is not under memory
pressure we should trend towards the performance of pinned, static
mappings and we should be able to create arbitrarily large pre-fault
behavior to show that.  But I think what we really want to enable via
IOPF is density, right?  Therefore how many more assigned device guests
can you run on a host with IOPF?  How does the slope, plateau, and
inflection point of their aggregate throughput compare to static
pinning?  VM startup time is probably also a useful metric, ie. trading
device latency for startup latency.  Thanks,

Alex



Re: [RFC PATCH v3 2/8] vfio/type1: Add a page fault handler

2021-05-24 Thread Alex Williamson
On Fri, 21 May 2021 14:38:52 +0800
Shenming Lu  wrote:

> On 2021/5/19 2:58, Alex Williamson wrote:
> > On Fri, 9 Apr 2021 11:44:14 +0800
> > Shenming Lu  wrote:
> >   
> >> VFIO manages the DMA mapping itself. To support IOPF (on-demand paging)
> >> for VFIO (IOMMU capable) devices, we add a VFIO page fault handler to
> >> serve the reported page faults from the IOMMU driver.
> >>
> >> Signed-off-by: Shenming Lu 
> >> ---
> >>  drivers/vfio/vfio_iommu_type1.c | 114 
> >>  1 file changed, 114 insertions(+)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c 
> >> b/drivers/vfio/vfio_iommu_type1.c
> >> index 45cbfd4879a5..ab0ff60ee207 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -101,6 +101,7 @@ struct vfio_dma {
> >>struct task_struct  *task;
> >>struct rb_root  pfn_list;   /* Ex-user pinned pfn list */
> >>unsigned long   *bitmap;
> >> +  unsigned long   *iopf_mapped_bitmap;
> >>  };
> >>  
> >>  struct vfio_batch {
> >> @@ -141,6 +142,16 @@ struct vfio_regions {
> >>size_t len;
> >>  };
> >>  
> >> +/* A global IOPF enabled group list */
> >> +static struct rb_root iopf_group_list = RB_ROOT;
> >> +static DEFINE_MUTEX(iopf_group_list_lock);
> >> +
> >> +struct vfio_iopf_group {
> >> +  struct rb_node  node;
> >> +  struct iommu_group  *iommu_group;
> >> +  struct vfio_iommu   *iommu;
> >> +};
> >> +
> >>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)   \
> >>(!list_empty(&iommu->domain_list))
> >>  
> >> @@ -157,6 +168,10 @@ struct vfio_regions {
> >>  #define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX)
> >>  #define DIRTY_BITMAP_SIZE_MAX  
> >> DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
> >>  
> >> +#define IOPF_MAPPED_BITMAP_GET(dma, i)                            \
> >> +       ((dma->iopf_mapped_bitmap[(i) / BITS_PER_LONG]             \
> >> +         >> ((i) % BITS_PER_LONG)) & 0x1)  
> > 
> > 
> > Can't we just use test_bit()?  
> 
> Yeah, we can use it.
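
For reference, the test_bit() form (standard bitops API) replaces the
open-coded macro entirely, e.g.:

        /* instead of IOPF_MAPPED_BITMAP_GET(dma, i): */
        if (test_bit(i, dma->iopf_mapped_bitmap))
                return 0; /* page already mapped, nothing to do */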
> 
> > 
> >   
> >> +
> >>  #define WAITED 1
> >>  
> >>  static int put_pfn(unsigned long pfn, int prot);
> >> @@ -416,6 +431,34 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma 
> >> *dma, struct vfio_pfn *vpfn)
> >>return ret;
> >>  }
> >>  
> >> +/*
> >> + * Helper functions for iopf_group_list
> >> + */
> >> +static struct vfio_iopf_group *
> >> +vfio_find_iopf_group(struct iommu_group *iommu_group)
> >> +{
> >> +  struct vfio_iopf_group *iopf_group;
> >> +  struct rb_node *node;
> >> +
> >> +  mutex_lock(&iopf_group_list_lock);
> >> +
> >> +  node = iopf_group_list.rb_node;
> >> +
> >> +  while (node) {
> >> +  iopf_group = rb_entry(node, struct vfio_iopf_group, node);
> >> +
> >> +  if (iommu_group < iopf_group->iommu_group)
> >> +  node = node->rb_left;
> >> +  else if (iommu_group > iopf_group->iommu_group)
> >> +  node = node->rb_right;
> >> +  else
> >> +  break;
> >> +  }
> >> +
> >> +  mutex_unlock(&iopf_group_list_lock);
> >> +  return node ? iopf_group : NULL;
> >> +}  
> > 
> > This looks like a pretty heavy weight operation per DMA fault.
> > 
> > I'm also suspicious of this validity of this iopf_group after we've
> > dropped the locking, the ordering of patches makes this very confusing.  
> 
> My thought was to include the handling of DMA faults completely in the type1
> backend by introducing the vfio_iopf_group struct. But it seems that
> introducing a struct with an unknown lifecycle causes more problems...
> I will use the path from vfio-core as in v2 for simplicity and validity.
> 
> Sorry for the confusion; I will rework the series later. :-)
> 
> >   
> >> +
> >>  static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
> >>  {
> >>struct mm_struct *mm;
> >> @@ -3106,6 +3149,77 @@ static int vfio_iommu_type1_dirty_pages(struct 
> >

Re: [RFC PATCH v3 8/8] vfio: Add nested IOPF support

2021-05-24 Thread Alex Williamson
On Mon, 24 May 2021 21:11:11 +0800
Shenming Lu  wrote:

> On 2021/5/21 15:59, Shenming Lu wrote:
> > On 2021/5/19 2:58, Alex Williamson wrote:  
> >> On Fri, 9 Apr 2021 11:44:20 +0800
> >> Shenming Lu  wrote:
> >>  
> >>> To set up nested mode, drivers such as vfio_pci need to register a
> >>> handler to receive stage/level 1 faults from the IOMMU, but since
> >>> currently each device can only have one iommu dev fault handler,
> >>> and if stage 2 IOPF is already enabled (VFIO_IOMMU_ENABLE_IOPF),
> >>> we choose to update the registered handler (a consolidated one) via
> >>> flags (set FAULT_REPORT_NESTED_L1), and further deliver the received
> >>> stage 1 faults in the handler to the guest through a newly added
> >>> vfio_device_ops callback.  
> >>
> >> Are there proposed in-kernel drivers that would use any of these
> >> symbols?  
> > 
> > I hope that series such as Eric's SMMUv3 Nested Stage Setup [1] can
> > use these symbols to consolidate the two page fault handlers into one.
> > 
> > [1] 
> > https://patchwork.kernel.org/project/kvm/cover/2021044659.15051-1-eric.au...@redhat.com/
> >   
> >>  
> >>> Signed-off-by: Shenming Lu 
> >>> ---
> >>>  drivers/vfio/vfio.c | 81 +
> >>>  drivers/vfio/vfio_iommu_type1.c | 49 +++-
> >>>  include/linux/vfio.h| 12 +
> >>>  3 files changed, 141 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> >>> index 44c8dfabf7de..4245f15914bf 100644
> >>> --- a/drivers/vfio/vfio.c
> >>> +++ b/drivers/vfio/vfio.c
> >>> @@ -2356,6 +2356,87 @@ struct iommu_domain 
> >>> *vfio_group_iommu_domain(struct vfio_group *group)
> >>>  }
> >>>  EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
> >>>  
> >>> +/*
> >>> + * Register/Update the VFIO IOPF handler to receive
> >>> + * nested stage/level 1 faults.
> >>> + */
> >>> +int vfio_iommu_dev_fault_handler_register_nested(struct device *dev)
> >>> +{
> >>> + struct vfio_container *container;
> >>> + struct vfio_group *group;
> >>> + struct vfio_iommu_driver *driver;
> >>> + int ret;
> >>> +
> >>> + if (!dev)
> >>> + return -EINVAL;
> >>> +
> >>> + group = vfio_group_get_from_dev(dev);
> >>> + if (!group)
> >>> + return -ENODEV;
> >>> +
> >>> + ret = vfio_group_add_container_user(group);
> >>> + if (ret)
> >>> + goto out;
> >>> +
> >>> + container = group->container;
> >>> + driver = container->iommu_driver;
> >>> + if (likely(driver && driver->ops->register_handler))
> >>> + ret = driver->ops->register_handler(container->iommu_data, dev);
> >>> + else
> >>> + ret = -ENOTTY;
> >>> +
> >>> + vfio_group_try_dissolve_container(group);
> >>> +
> >>> +out:
> >>> + vfio_group_put(group);
> >>> + return ret;
> >>> +}
> >>> +EXPORT_SYMBOL_GPL(vfio_iommu_dev_fault_handler_register_nested);
> >>> +
> >>> +int vfio_iommu_dev_fault_handler_unregister_nested(struct device *dev)
> >>> +{
> >>> + struct vfio_container *container;
> >>> + struct vfio_group *group;
> >>> + struct vfio_iommu_driver *driver;
> >>> + int ret;
> >>> +
> >>> + if (!dev)
> >>> + return -EINVAL;
> >>> +
> >>> + group = vfio_group_get_from_dev(dev);
> >>> + if (!group)
> >>> + return -ENODEV;
> >>> +
> >>> + ret = vfio_group_add_container_user(group);
> >>> + if (ret)
> >>> + goto out;
> >>> +
> >>> + container = group->container;
> >>> + driver = container->iommu_driver;
> >>> + if (likely(driver && driver->ops->unregister_handler))
> >>> + ret = driver->ops->unregister_handler(container->iommu_data, 
> >>> dev);
> >>> + else
> >>> + ret = -ENOTTY;
> >>> +
> >>> + vfio_group_try_dissolve_container(group);
> >>> +
> >>> +out:
> >>
