On Fri, 11 May 2018 20:06:06 +0100
Jean-Philippe Brucker <[email protected]> wrote:

> When creating an io_mm structure, register an MMU notifier that informs
> us when the virtual address space changes and disappears.
> 
> Add a new operation to the IOMMU driver, mm_invalidate, called when a
> range of addresses is unmapped to let the IOMMU driver send ATC
> invalidations. mm_invalidate cannot sleep.
> 
> Adding the notifier complicates io_mm release. In one case the device
> driver frees the io_mm explicitly, by calling unbind (or by detaching
> the device from its domain). In the other case the process crashes
> before unbind, and the release notifier has to do all the work.
> 
> Allowing the device driver's mm_exit() handler to sleep adds another
> complication, but it greatly simplifies things for users. For example,
> VFIO can take the IOMMU mutex and remove any trace of io_mm, instead of
> introducing complex synchronization to delicately handle this race. But
> relaxing the user side does force unbind() to sleep and wait for all
> pending mm_exit() calls to finish.
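
(As an aside, to illustrate what the relaxed rules buy: a driver's
mm_exit() handler can now be as simple as the sketch below. The my_drv_*
names are invented for illustration, they are not part of this series:

	static int my_drv_mm_exit(struct device *dev, int pasid, void *drvdata)
	{
		struct my_drv *drv = drvdata;

		/* Sleeping is allowed, so a plain mutex is fine here. */
		mutex_lock(&drv->lock);
		/* Stop DMA and drop everything tied to this PASID. */
		my_drv_stop_dma(drv, pasid);
		mutex_unlock(&drv->lock);

		return 0;
	}

with unbind() sleeping until such a handler has returned.)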
> 
> Signed-off-by: Jean-Philippe Brucker <[email protected]>
> 
> ---
> v1->v2:
> * Unbind() waits for mm_exit to finish
> * mm_exit can sleep
> ---
>  drivers/iommu/Kconfig     |   1 +
>  drivers/iommu/iommu-sva.c | 248 +++++++++++++++++++++++++++++++++++---
>  include/linux/iommu.h     |  10 ++
>  3 files changed, 244 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index cca8e06903c7..38434899e283 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -77,6 +77,7 @@ config IOMMU_DMA
>  config IOMMU_SVA
>       bool
>       select IOMMU_API
> +     select MMU_NOTIFIER
>  
>  config FSL_PAMU
>       bool "Freescale IOMMU support"
> diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
> index 0700893c679d..e9afae2537a2 100644
> --- a/drivers/iommu/iommu-sva.c
> +++ b/drivers/iommu/iommu-sva.c
> @@ -7,6 +7,7 @@
>  
>  #include <linux/idr.h>
>  #include <linux/iommu.h>
> +#include <linux/mmu_notifier.h>
>  #include <linux/sched/mm.h>
>  #include <linux/slab.h>
>  #include <linux/spinlock.h>
> @@ -106,6 +107,9 @@ struct iommu_bond {
>       struct list_head        mm_head;
>       struct list_head        dev_head;
>       struct list_head        domain_head;
> +     refcount_t              refs;
> +     struct wait_queue_head  mm_exit_wq;
> +     bool                    mm_exit_active;
>  
>       void                    *drvdata;
>  };
> @@ -124,6 +128,8 @@ static DEFINE_IDR(iommu_pasid_idr);
>   */
>  static DEFINE_SPINLOCK(iommu_sva_lock);
>  
> +static struct mmu_notifier_ops iommu_mmu_notifier;
> +
>  static struct io_mm *
>  io_mm_alloc(struct iommu_domain *domain, struct device *dev,
>           struct mm_struct *mm, unsigned long flags)
> @@ -151,6 +157,7 @@ io_mm_alloc(struct iommu_domain *domain, struct device *dev,
>  
>       io_mm->flags            = flags;
>       io_mm->mm               = mm;
> +     io_mm->notifier.ops     = &iommu_mmu_notifier;
>       io_mm->release          = domain->ops->mm_free;
>       INIT_LIST_HEAD(&io_mm->devices);
>  
> @@ -167,8 +174,29 @@ io_mm_alloc(struct iommu_domain *domain, struct device *dev,
>               goto err_free_mm;
>       }
>  
> -     /* TODO: keep track of mm. For the moment, abort. */
> -     ret = -ENOSYS;
> +     ret = mmu_notifier_register(&io_mm->notifier, mm);
> +     if (ret)
> +             goto err_free_pasid;
> +
> +     /*
> +      * Now that the MMU notifier is valid, we can allow users to grab this
> +      * io_mm by setting a valid refcount. Before that it was accessible in
> +      * the IDR but invalid.
> +      *
> +      * The following barrier ensures that users, who obtain the io_mm with
> +      * kref_get_unless_zero, don't read uninitialized fields in the
> +      * structure.
> +      */
> +     smp_wmb();
> +     kref_init(&io_mm->kref);
> +
> +     return io_mm;
> +
> +err_free_pasid:
> +     /*
> +      * Even if the io_mm is accessible from the IDR at this point, kref is
> +      * 0 so no user could get a reference to it. Free it manually.
> +      */
>       spin_lock(&iommu_sva_lock);
>       idr_remove(&iommu_pasid_idr, io_mm->pasid);
>       spin_unlock(&iommu_sva_lock);
> @@ -180,9 +208,13 @@ io_mm_alloc(struct iommu_domain *domain, struct device *dev,
>       return ERR_PTR(ret);
>  }
>  
> -static void io_mm_free(struct io_mm *io_mm)
> +static void io_mm_free(struct rcu_head *rcu)
>  {
> -     struct mm_struct *mm = io_mm->mm;
> +     struct io_mm *io_mm;
> +     struct mm_struct *mm;
> +
> +     io_mm = container_of(rcu, struct io_mm, rcu);
> +     mm = io_mm->mm;
>  
>       io_mm->release(io_mm);
>       mmdrop(mm);
> @@ -197,7 +229,22 @@ static void io_mm_release(struct kref *kref)
>  
>       idr_remove(&iommu_pasid_idr, io_mm->pasid);
>  
> -     io_mm_free(io_mm);
> +     /*
> +      * If we're being released from mm exit, the notifier callback ->release
> +      * has already been called. Otherwise we don't need ->release, the io_mm
> +      * isn't attached to anything anymore. Hence no_release.
> +      */
> +     mmu_notifier_unregister_no_release(&io_mm->notifier, io_mm->mm);
> +
> +     /*
> +      * We can't free the structure here, because if mm exits during
> +      * unbind(), then ->release might be attempting to grab the io_mm
> +      * concurrently. And in the other case, if ->release is calling
> +      * io_mm_release, then __mmu_notifier_release expects to still have a
> +      * valid mn when returning. So free the structure when it's safe, after
> +      * the RCU grace period elapsed.
> +      */
> +     mmu_notifier_call_srcu(&io_mm->rcu, io_mm_free);
>  }
>  
>  /*
> @@ -206,8 +253,14 @@ static void io_mm_release(struct kref *kref)
>   */
>  static int io_mm_get_locked(struct io_mm *io_mm)
>  {
> -     if (io_mm)
> -             return kref_get_unless_zero(&io_mm->kref);
> +     if (io_mm && kref_get_unless_zero(&io_mm->kref)) {
> +             /*
> +              * kref_get_unless_zero doesn't provide ordering for reads. This
> +              * barrier pairs with the one in io_mm_alloc.
> +              */
> +             smp_rmb();
> +             return 1;
> +     }
>  
>       return 0;
>  }
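
The publish/consume pairing might be easier to check laid out side by
side (same code as above, just rearranged):

	io_mm_alloc()                      io_mm_get_locked()

	initialize io_mm fields            if (kref_get_unless_zero(&kref)) {
	smp_wmb();          pairs with             smp_rmb();
	kref_init(&kref);   <-------->             read io_mm fields
	                                   }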
> @@ -233,7 +286,8 @@ static int io_mm_attach(struct iommu_domain *domain, struct device *dev,
>       struct iommu_bond *bond, *tmp;
>       struct iommu_sva_param *param = dev->iommu_param->sva_param;
>  
> -     if (!domain->ops->mm_attach || !domain->ops->mm_detach)
> +     if (!domain->ops->mm_attach || !domain->ops->mm_detach ||
> +         !domain->ops->mm_invalidate)
>               return -ENODEV;
>  
>       if (pasid > param->max_pasid || pasid < param->min_pasid)
> @@ -247,6 +301,8 @@ static int io_mm_attach(struct iommu_domain *domain, struct device *dev,
>       bond->io_mm             = io_mm;
>       bond->dev               = dev;
>       bond->drvdata           = drvdata;
> +     refcount_set(&bond->refs, 1);
> +     init_waitqueue_head(&bond->mm_exit_wq);
>  
>       spin_lock(&iommu_sva_lock);
>       /*
> @@ -275,12 +331,37 @@ static int io_mm_attach(struct iommu_domain *domain, struct device *dev,
>       return 0;
>  }
>  
> -static void io_mm_detach_locked(struct iommu_bond *bond)
> +static void io_mm_detach_locked(struct iommu_bond *bond, bool wait)
>  {
>       struct iommu_bond *tmp;
>       bool detach_domain = true;
>       struct iommu_domain *domain = bond->domain;
>  
> +     if (wait) {
> +             bool do_detach = true;
> +             /*
> +              * If we're called from unbind(), we delete the bond no matter
> +              * what. Tell the mm_exit thread that we're cleaning up, and
> +              * wait until it finishes using the bond.
> +              *
> +              * refs is guaranteed to be one or more, otherwise it would
> +              * already have been removed from the list. Check is someone is

Check if someone...

> +              * already waiting, in which case we wait but do not free.
> +              */
> +             if (refcount_read(&bond->refs) > 1)
> +                     do_detach = false;
> +
> +             refcount_inc(&bond->refs);
> +             wait_event_lock_irq(bond->mm_exit_wq, !bond->mm_exit_active,
> +                                 iommu_sva_lock);
> +             if (!do_detach)
> +                     return;
> +
> +     } else if (!refcount_dec_and_test(&bond->refs)) {
> +             /* unbind() is waiting to free the bond */
> +             return;
> +     }
> +
>       list_for_each_entry(tmp, &domain->mm_list, domain_head) {
>               if (tmp->io_mm == bond->io_mm && tmp->dev != bond->dev) {
>                       detach_domain = false;
> @@ -298,6 +379,129 @@ static void io_mm_detach_locked(struct iommu_bond *bond)
>       kfree(bond);
>  }
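
If I read the refs/mm_exit_wq dance correctly, the interleaving it
handles is the following (one possible timeline, unbind() racing with
the mm going away):

	unbind()                            mm exit (->release)
	--------                            -------------------
	                                    bond->mm_exit_active = true;
	                                    spin_unlock(&iommu_sva_lock);
	spin_lock_irq(&iommu_sva_lock);
	/* refs == 1 */ do_detach = true;
	refcount_inc(&bond->refs);
	wait_event_lock_irq(...);           param->mm_exit(dev, pasid, drvdata);
	  (sleeps, lock dropped)            spin_lock(&iommu_sva_lock);
	                                    bond->mm_exit_active = false;
	                                    wake_up_all(&bond->mm_exit_wq);
	                                    io_mm_detach_locked(bond, false);
	                                      /* refs 2->1, bond survives */
	(wakes with the lock held)
	do_detach is set: detach and kfree(bond)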
>  
> +static int iommu_signal_mm_exit(struct iommu_bond *bond)
> +{
> +     struct device *dev = bond->dev;
> +     struct io_mm *io_mm = bond->io_mm;
> +     struct iommu_sva_param *param = dev->iommu_param->sva_param;
> +
> +     /*
> +      * We can't hold the device's param_lock. If we did and the device
> +      * driver used a global lock around io_mm, we would risk getting the
> +      * following deadlock:
> +      *
> +      *   exit_mm()                 |  Shutdown SVA
> +      *    mutex_lock(param->lock)  |   mutex_lock(glob lock)
> +      *     param->mm_exit()        |    sva_device_shutdown()
> +      *      mutex_lock(glob lock)  |     mutex_lock(param->lock)
> +      *
> +      * Fortunately unbind() waits for us to finish, and sva_device_shutdown
> +      * requires that all bonds be removed, so we can safely access mm_exit
> +      * and drvdata without taking any lock.
> +      */
> +     if (!param || !param->mm_exit)
> +             return 0;
> +
> +     return param->mm_exit(dev, io_mm->pasid, bond->drvdata);
> +}
> +
> +/* Called when the mm exits. Can race with unbind(). */
> +static void iommu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm)
> +{
> +     struct iommu_bond *bond, *next;
> +     struct io_mm *io_mm = container_of(mn, struct io_mm, notifier);
> +
> +     /*
> +      * If the mm is exiting then devices are still bound to the io_mm.
> +      * A few things need to be done before it is safe to release:
> +      *
> +      * - As the mmu notifier doesn't hold any reference to the io_mm when
> +      *   calling ->release(), try to take a reference.
> +      * - Tell the device driver to stop using this PASID.
> +      * - Clear the PASID table and invalidate TLBs.
> +      * - Drop all references to this io_mm by freeing the bonds.
> +      */
> +     spin_lock(&iommu_sva_lock);
> +     if (!io_mm_get_locked(io_mm)) {
> +             /* Someone's already taking care of it. */
> +             spin_unlock(&iommu_sva_lock);
> +             return;
> +     }
> +
> +     list_for_each_entry_safe(bond, next, &io_mm->devices, mm_head) {
> +             /*
> +              * Release the lock to let the handler sleep. We need to be
> +              * careful about concurrent modifications to the list and to the
> +              * bond. Tell unbind() not to free the bond until we're done.
> +              */
> +             bond->mm_exit_active = true;
> +             spin_unlock(&iommu_sva_lock);
> +
> +             if (iommu_signal_mm_exit(bond))
> +                     dev_WARN(bond->dev, "possible leak of PASID %u",
> +                              io_mm->pasid);
> +
> +             spin_lock(&iommu_sva_lock);
> +             next = list_next_entry(bond, mm_head);
> +
> +             /* If someone is waiting, let them delete the bond now */
> +             bond->mm_exit_active = false;
> +             wake_up_all(&bond->mm_exit_wq);
> +
> +             /* Otherwise, do it ourselves */
> +             io_mm_detach_locked(bond, false);
> +     }
> +     spin_unlock(&iommu_sva_lock);
> +
> +     /*
> +      * We're now reasonably certain that no more faults are being handled for
> +      * this io_mm, since we just flushed them all out of the fault queue.
> +      * Release the last reference to free the io_mm.
> +      */
> +     io_mm_put(io_mm);
> +}
> +
> +static void iommu_notifier_invalidate_range(struct mmu_notifier *mn,
> +                                         struct mm_struct *mm,
> +                                         unsigned long start,
> +                                         unsigned long end)
> +{
> +     struct iommu_bond *bond;
> +     struct io_mm *io_mm = container_of(mn, struct io_mm, notifier);
> +
> +     spin_lock(&iommu_sva_lock);
> +     list_for_each_entry(bond, &io_mm->devices, mm_head) {
> +             struct iommu_domain *domain = bond->domain;
> +
> +             domain->ops->mm_invalidate(domain, bond->dev, io_mm, start,
> +                                        end - start);
> +     }
> +     spin_unlock(&iommu_sva_lock);
> +}
> +
> +static int iommu_notifier_clear_flush_young(struct mmu_notifier *mn,
> +                                         struct mm_struct *mm,
> +                                         unsigned long start,
> +                                         unsigned long end)
> +{
> +     iommu_notifier_invalidate_range(mn, mm, start, end);
> +     return 0;
> +}
> +
> +static void iommu_notifier_change_pte(struct mmu_notifier *mn,
> +                                   struct mm_struct *mm,
> +                                   unsigned long address, pte_t pte)
> +{
> +     iommu_notifier_invalidate_range(mn, mm, address, address + PAGE_SIZE);
> +}
> +
> +static struct mmu_notifier_ops iommu_mmu_notifier = {
> +     .release                = iommu_notifier_release,
> +     .clear_flush_young      = iommu_notifier_clear_flush_young,
> +     .change_pte             = iommu_notifier_change_pte,
> +     .invalidate_range       = iommu_notifier_invalidate_range,
> +};
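
To make the contract on the new op concrete, an implementation would be
shaped roughly like this sketch (the my_iommu_* names are made up here;
the real driver code lives elsewhere in the series):

	static void my_iommu_mm_invalidate(struct iommu_domain *domain,
					   struct device *dev,
					   struct io_mm *io_mm,
					   unsigned long vaddr, size_t size)
	{
		/*
		 * Called under iommu_sva_lock from the MMU notifier, so it
		 * must not sleep: queue ATC invalidations for
		 * [vaddr, vaddr + size) and spin until they complete.
		 */
		my_iommu_atc_inv_range(dev, io_mm->pasid, vaddr, size);
	}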
> +
>  /**
>   * iommu_sva_device_init() - Initialize Shared Virtual Addressing for a device
>   * @dev: the device
> @@ -320,6 +524,12 @@ static void io_mm_detach_locked(struct iommu_bond *bond)
>   * The handler gets an opaque pointer corresponding to the drvdata passed as
>   * argument of bind().
>   *
> + * The @mm_exit handler is allowed to sleep. Be careful about the locks taken in
> + * @mm_exit, because they might lead to deadlocks if they are also held when
> + * dropping references to the mm. Consider the following call chain:
> + *   mutex_lock(A); mmput(mm) -> exit_mm() -> @mm_exit() -> mutex_lock(A)
> + * Using mmput_async() prevents this scenario.
> + *
>   * The device should not be performing any DMA while this function is running,
>   * otherwise the behavior is undefined.
>   *
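
The mmput_async() advice is worth spelling out. In a hypothetical driver
holding lock A (following the call chain from the comment above):

	/* Deadlocks if this drops the last mm reference: */
	mutex_lock(&A);
	mmput(ctx->mm);		/* -> exit path -> @mm_exit() -> mutex_lock(A) */
	mutex_unlock(&A);

	/* Safe: the final __mmput() runs from a workqueue instead: */
	mutex_lock(&A);
	mmput_async(ctx->mm);
	mutex_unlock(&A);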
> @@ -484,15 +694,16 @@ int __iommu_sva_unbind_device(struct device *dev, int pasid)
>       if (!param || WARN_ON(!domain))
>               return -EINVAL;
>  
> -     spin_lock(&iommu_sva_lock);
> +     /* spin_lock_irq matches the one in wait_event_lock_irq */
> +     spin_lock_irq(&iommu_sva_lock);
>       list_for_each_entry(bond, &param->mm_list, dev_head) {
>               if (bond->io_mm->pasid == pasid) {
> -                     io_mm_detach_locked(bond);
> +                     io_mm_detach_locked(bond, true);
>                       ret = 0;
>                       break;
>               }
>       }
> -     spin_unlock(&iommu_sva_lock);
> +     spin_unlock_irq(&iommu_sva_lock);
>  
>       return ret;
>  }
> @@ -503,18 +714,25 @@ EXPORT_SYMBOL_GPL(__iommu_sva_unbind_device);
>   * @dev: the device
>   *
>   * When detaching @device from a domain, IOMMU drivers should use this helper.
> + * This function may sleep while waiting for bonds to be released.
>   */
>  void __iommu_sva_unbind_dev_all(struct device *dev)
>  {
>       struct iommu_sva_param *param;
>       struct iommu_bond *bond, *next;
>  
> +     /*
> +      * io_mm_detach_locked might wait, so we shouldn't call it with the dev
> +      * param lock held. It's fine to read sva_param outside the lock because
> +      * it can only be freed by iommu_sva_device_shutdown when there are no
> +      * more bonds in the list.
> +      */
>       param = dev->iommu_param->sva_param;
>       if (param) {
> -             spin_lock(&iommu_sva_lock);
> +             spin_lock_irq(&iommu_sva_lock);
>               list_for_each_entry_safe(bond, next, &param->mm_list, dev_head)
> -                     io_mm_detach_locked(bond);
> -             spin_unlock(&iommu_sva_lock);
> +                     io_mm_detach_locked(bond, true);
> +             spin_unlock_irq(&iommu_sva_lock);
>       }
>  }
>  EXPORT_SYMBOL_GPL(__iommu_sva_unbind_dev_all);
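
So on the driver side the expected teardown order becomes (a sketch,
using only the entry points mentioned in this patch):

	/* For each bound PASID; sleeps until any pending mm_exit() is done. */
	__iommu_sva_unbind_device(dev, pasid);

	/* Legal only once all bonds are gone, per the comments above. */
	iommu_sva_device_shutdown(dev);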
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 439c8fffd836..caa6f79785b9 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -24,6 +24,7 @@
>  #include <linux/types.h>
>  #include <linux/errno.h>
>  #include <linux/err.h>
> +#include <linux/mmu_notifier.h>
>  #include <linux/of.h>
>  #include <uapi/linux/iommu.h>
>  
> @@ -111,10 +112,15 @@ struct io_mm {
>       unsigned long           flags;
>       struct list_head        devices;
>       struct kref             kref;
> +#if defined(CONFIG_MMU_NOTIFIER)
> +     struct mmu_notifier     notifier;
> +#endif
>       struct mm_struct        *mm;
>  
>       /* Release callback for this mm */
>       void (*release)(struct io_mm *io_mm);
> +     /* For postponed release */
> +     struct rcu_head         rcu;
>  };
>  
>  enum iommu_cap {
> @@ -249,6 +255,7 @@ struct iommu_sva_param {
>   * @mm_attach: attach io_mm to a device. Install PASID entry if necessary
>   * @mm_detach: detach io_mm from a device. Remove PASID entry and
>   *             flush associated TLB entries.
> + * @mm_invalidate: Invalidate a range of mappings for an mm
>   * @map: map a physically contiguous memory region to an iommu domain
>   * @unmap: unmap a physically contiguous memory region from an iommu domain
>   * @map_sg: map a scatter-gather list of physically contiguous memory chunks
> @@ -298,6 +305,9 @@ struct iommu_ops {
>                        struct io_mm *io_mm, bool attach_domain);
>       void (*mm_detach)(struct iommu_domain *domain, struct device *dev,
>                         struct io_mm *io_mm, bool detach_domain);
> +     void (*mm_invalidate)(struct iommu_domain *domain, struct device *dev,
> +                           struct io_mm *io_mm, unsigned long vaddr,
> +                           size_t size);
>       int (*map)(struct iommu_domain *domain, unsigned long iova,
>                  phys_addr_t paddr, size_t size, int prot);
>       size_t (*unmap)(struct iommu_domain *domain, unsigned long iova,
