Hi,

On 08/10/2015 09:51 AM, Chris Wilson wrote:
> Michał Winiarski found a really evil way to trigger a struct_mutex
> deadlock with userptr. He found that if he allocated a userptr bo and
> then GTT mmaped another bo, or even itself, at the same address as the
> userptr using MAP_FIXED, he could then cause a deadlock any time we then
> had to invalidate the GTT mmappings (so at will). Tvrtko then found by
> repeatedly allocating GTT mmappings he could alias with an old userptr
> mmap and also trigger the deadlock.
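
For reference, the trigger can be driven from userspace along these lines
(a minimal sketch, error handling elided; the canonical version is
igt/gem_userptr_blits/map-fixed*):

========================================================================
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <drm/i915_drm.h>

#define SZ (16 * 4096)

int main(void)
{
	int fd = open("/dev/dri/card0", O_RDWR);
	void *ptr = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct drm_i915_gem_userptr userptr;
	struct drm_i915_gem_mmap_gtt gtt;

	/* Wrap the CPU pages in a userptr bo. */
	memset(&userptr, 0, sizeof(userptr));
	userptr.user_ptr = (uintptr_t)ptr;
	userptr.user_size = SZ;
	ioctl(fd, DRM_IOCTL_I915_GEM_USERPTR, &userptr);

	/* GTT mmap a bo (here the userptr bo itself) over the same
	 * range with MAP_FIXED, aliasing the two mappings. */
	memset(&gtt, 0, sizeof(gtt));
	gtt.handle = userptr.handle;
	ioctl(fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &gtt);
	mmap(ptr, SZ, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED,
	     fd, gtt.offset);

	/* Any later revocation of the GTT mmapping now fires the
	 * mmu-notifier on the userptr range with struct_mutex held. */
	return 0;
}
========================================================================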
> 
> To counteract the deadlock, we make the observation that we only need
> to take the struct_mutex if the object has any pages to revoke, and that
> before userspace can alias with the userptr address space, it must have
> invalidated the userptr->pages. Thus if we can check for those pages
> outside of the struct_mutex, we can avoid the deadlock. To do so we
> introduce a separate flag for userptr objects that we can inspect from
> the mmu-notifier underneath its spinlock.
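
(Both sides of that handshake appear as hunks in the patch below;
distilled, the setter flips the flag under the notifier spinlock, and the
invalidate walkers test it before committing to anything that needs
struct_mutex:)

========================================================================
	/* get_pages/put_pages side: flip the flag under mn->lock */
	spin_lock(&obj->userptr.mmu_object->mn->lock);
	obj->userptr.mmu_object->active = value;
	spin_unlock(&obj->userptr.mmu_object->mn->lock);

	/* invalidate-range walker, already under mn->lock: an inactive
	 * object has no pages to revoke, so it is skipped without ever
	 * touching struct_mutex */
	if (!mo->active ||
	    !kref_get_unless_zero(&mo->obj->base.refcount))
		continue;
========================================================================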
> 
> The patch makes one eye-catching change. That is the removal of serial=0
> after detecting a to-be-freed object inside the invalidate walker. I
> felt setting serial=0 was a questionable pessimisation: it denies us the
> chance to reuse the current iterator for the next loop (before it is
> freed) and being explicit makes the reader question the validity of the
> locking (since the object-free race could occur elsewhere). The
> serialisation of the iterator is through the spinlock; if the object is
> freed before the next loop then the notifier.serial will be incremented
> and we start the walk from the beginning as we detect the invalid cache.
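
(A self-contained userspace analogue of that revalidation scheme, with a
pthread mutex standing in for mn->lock and all names illustrative rather
than the driver's:)

========================================================================
#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int payload; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;
static unsigned long serial = 1;	/* bumped on every list mutation */

static void walk(void)
{
	unsigned long snapshot;
	struct node *it;

restart:
	pthread_mutex_lock(&lock);
	snapshot = serial;
	for (it = head; it; it = it->next) {
		int payload = it->payload;

		/* Drop the lock for the heavy work, as the notifier
		 * does around cancelling a userptr. */
		pthread_mutex_unlock(&lock);
		printf("processing %d\n", payload);
		pthread_mutex_lock(&lock);

		/* If the list mutated while we were unlocked, "it" may
		 * point at freed memory: restart the walk rather than
		 * trusting a stale iterator. */
		if (snapshot != serial) {
			pthread_mutex_unlock(&lock);
			goto restart;
		}
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	struct node n2 = { NULL, 2 }, n1 = { &n2, 1 };

	head = &n1;
	walk();
	return 0;
}
========================================================================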
> 
> To try and tame the error paths and interactions with the userptr->active
> flag, we have to do a fair amount of rearranging of get_pages_userptr().
> 
> v2: Grammar fixes
> v3: Reorder set-active so that it is only set when obj->pages is set
> (and so needs cancellation). Only the order of setting obj->pages and
> the active-flag is crucial. Calling gup after invalidate-range begin
> means the userptr sees the new set of backing storage (and so will not
> need to invalidate its new pages), but we have to be careful not to set
> the active-flag prior to successfully establishing obj->pages.
> v4: Take the active-flag early so we know in the mmu-notifier when we
> have to cancel a pending gup-worker.
> 
> Reported-by: Michał Winiarski <[email protected]>
> Testcase: igt/gem_userptr_blits/map-fixed*
> Signed-off-by: Chris Wilson <[email protected]>
> Cc: Michał Winiarski <[email protected]>
> Cc: Tvrtko Ursulin <[email protected]>
> Cc: [email protected]
> ---
>   drivers/gpu/drm/i915/i915_gem_userptr.c | 111 ++++++++++++++++++++++----------
>   1 file changed, 78 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 800a5394aa1e..e21f885db87b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -59,6 +59,7 @@ struct i915_mmu_object {
>       struct interval_tree_node it;
>       struct list_head link;
>       struct drm_i915_gem_object *obj;
> +     bool active;
>       bool is_linear;
>   };
>   
> @@ -114,7 +115,8 @@ restart:
>   
>               obj = mo->obj;
>   
> -             if (!kref_get_unless_zero(&obj->base.refcount))
> +             if (!mo->active ||
> +                 !kref_get_unless_zero(&obj->base.refcount))
>                       continue;
>   
>               spin_unlock(&mn->lock);
> @@ -151,7 +153,8 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
>               else
>                       it = interval_tree_iter_first(&mn->objects, start, end);
>               if (it != NULL) {
> -                     obj = container_of(it, struct i915_mmu_object, it)->obj;
> +                     struct i915_mmu_object *mo =
> +                             container_of(it, struct i915_mmu_object, it);
>   
>                       /* The mmu_object is released late when destroying the
>                        * GEM object so it is entirely possible to gain a
> @@ -160,11 +163,9 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
>                        * the struct_mutex - and consequently use it after it
>                        * is freed and then double free it.
>                        */
> -                     if (!kref_get_unless_zero(&obj->base.refcount)) {
> -                             spin_unlock(&mn->lock);
> -                             serial = 0;
> -                             continue;
> -                     }
> +                     if (mo->active &&
> +                         kref_get_unless_zero(&mo->obj->base.refcount))
> +                             obj = mo->obj;
>   
>                       serial = mn->serial;
>               }
> @@ -566,6 +567,30 @@ __i915_gem_userptr_set_pages(struct drm_i915_gem_object *obj,
>   }
>   
>   static void
> +__i915_gem_userptr_set_active(struct drm_i915_gem_object *obj,
> +                           bool value)
> +{
> +     /* During mm_invalidate_range we need to cancel any userptr that
> +      * overlaps the range being invalidated. Doing so requires the
> +      * struct_mutex, and that risks recursion. In order to cause
> +      * recursion, the user must alias the userptr address space with
> +      * a GTT mmapping (possible with a MAP_FIXED) - then when we have
> +      * to invalidate that mmapping, mm_invalidate_range is called with
> +      * the userptr address *and* the struct_mutex held.  To prevent that
> +      * we set a flag under the i915_mmu_notifier spinlock to indicate
> +      * whether this object is valid.
> +      */
> +#if defined(CONFIG_MMU_NOTIFIER)
> +     if (obj->userptr.mmu_object == NULL)
> +             return;
> +
> +     spin_lock(&obj->userptr.mmu_object->mn->lock);
> +     obj->userptr.mmu_object->active = value;
> +     spin_unlock(&obj->userptr.mmu_object->mn->lock);
> +#endif
> +}
> +
> +static void
>   __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
>   {
>       struct get_pages_work *work = container_of(_work, typeof(*work), work);
> @@ -613,6 +638,8 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
>                       }
>               }
>               obj->userptr.work = ERR_PTR(ret);
> +             if (ret)
> +                     __i915_gem_userptr_set_active(obj, false);
>       }
>   
>       obj->userptr.workers--;
> @@ -649,6 +676,18 @@ i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
>        * to the vma (discard or cloning) which should prevent the more
>        * egregious cases from causing harm.
>        */
> +     if (IS_ERR(obj->userptr.work)) {
> +             /* active flag will have been dropped already by the worker */
> +             ret = PTR_ERR(obj->userptr.work);
> +             obj->userptr.work = NULL;
> +             return ret;
> +     }
> +     if (obj->userptr.work)
> +             /* active flag should still be held for the pending work */
> +             return -EAGAIN;
> +
> +     /* Let the mmu-notifier know that we have begun and need cancellation */
> +     __i915_gem_userptr_set_active(obj, true);
>   
>       pvec = NULL;
>       pinned = 0;
> @@ -657,8 +696,10 @@ i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
>                              GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
>               if (pvec == NULL) {
>                       pvec = drm_malloc_ab(num_pages, sizeof(struct page *));
> -                     if (pvec == NULL)
> -                             return -ENOMEM;
> +                     if (pvec == NULL) {
> +                             ret = -ENOMEM;
> +                             goto err;
> +                     }
>               }
>   
>               pinned = __get_user_pages_fast(obj->userptr.ptr, num_pages,
> @@ -667,7 +708,7 @@ i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
>       if (pinned < num_pages) {
>               if (pinned < 0) {
>                       ret = pinned;
> -                     pinned = 0;
> +                     goto err;
>               } else {
>                       /* Spawn a worker so that we can acquire the
>                        * user pages without holding our mutex. Access
> @@ -688,44 +729,47 @@ i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
>                        * that error back to this function through
>                        * obj->userptr.work = ERR_PTR.
>                        */
> +
>                       ret = -EAGAIN;
> -                     if (obj->userptr.work == NULL &&
> -                         obj->userptr.workers < I915_GEM_USERPTR_MAX_WORKERS) {
> +                     if (obj->userptr.workers < I915_GEM_USERPTR_MAX_WORKERS) {
>                               struct get_pages_work *work;
>   
>                               work = kmalloc(sizeof(*work), GFP_KERNEL);
> -                             if (work != NULL) {
> -                                     obj->userptr.work = &work->work;
> -                                     obj->userptr.workers++;
> +                             if (work == NULL) {
> +                                     ret = -ENOMEM;
> +                                     goto err;
> +                             }
> +                             obj->userptr.work = &work->work;
> +                             obj->userptr.workers++;
>   
> -                                     work->obj = obj;
> -                                     drm_gem_object_reference(&obj->base);
> +                             work->obj = obj;
> +                             drm_gem_object_reference(&obj->base);
>   
> -                                     work->task = current;
> -                                     get_task_struct(work->task);
> +                             work->task = current;
> +                             get_task_struct(work->task);
>   
> -                                     INIT_WORK(&work->work, __i915_gem_userptr_get_pages_worker);
> -                                     schedule_work(&work->work);
> -                             } else
> -                                     ret = -ENOMEM;
> -                     } else {
> -                             if (IS_ERR(obj->userptr.work)) {
> -                                     ret = PTR_ERR(obj->userptr.work);
> -                                     obj->userptr.work = NULL;
> -                             }
> +                             INIT_WORK(&work->work,
> +                                       __i915_gem_userptr_get_pages_worker);
> +                             schedule_work(&work->work);
>                       }
> +                     goto err_active;
>               }
>       } else {
>               ret = __i915_gem_userptr_set_pages(obj, pvec, num_pages);
> -             if (ret == 0) {
> -                     obj->userptr.work = NULL;
> -                     pinned = 0;
> -             }
> +             if (ret)
> +                     goto err;
>       }
>   
> -     release_pages(pvec, pinned, 0);
> +out:
>       drm_free_large(pvec);
>       return ret;
> +
> +err:
> +     /* No pages here, no need for the mmu-notifier to wake us */
> +     __i915_gem_userptr_set_active(obj, false);
> +err_active:
> +     release_pages(pvec, pinned, 0);
> +     goto out;
>   }

I don't like the goto dance. Would something like the below be clearer?

========================================================================
static int
i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
{
        const int num_pages = obj->base.size >> PAGE_SHIFT;
        struct page **pvec;
        int pinned, ret;

        /* If userspace should engineer that these pages are replaced in
         * the vma between us binding this page into the GTT and completion
         * of rendering... Their loss. If they change the mapping of their
         * pages they need to create a new bo to point to the new vma.
         *
         * However, that still leaves open the possibility of the vma
         * being copied upon fork. Which falls under the same userspace
         * synchronisation issue as a regular bo, except that this time
         * the process may not be expecting that a particular piece of
         * memory is tied to the GPU.
         *
         * Fortunately, we can hook into the mmu_notifier in order to
         * discard the page references prior to anything nasty happening
         * to the vma (discard or cloning) which should prevent the more
         * egregious cases from causing harm.
         */
        if (IS_ERR(obj->userptr.work)) {
                /* active flag will have been dropped already by the worker */
                ret = PTR_ERR(obj->userptr.work);
                obj->userptr.work = NULL;
                return ret;
        }
        if (obj->userptr.work)
                /* active flag should still be held for the pending work */
                return -EAGAIN;

        /* Let the mmu-notifier know that we have begun and need cancellation */
        __i915_gem_userptr_set_active(obj, true);

        pvec = NULL;
        pinned = 0;
        if (obj->userptr.mm->mm == current->mm) {
                pvec = kmalloc(num_pages*sizeof(struct page *),
                               GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
                if (pvec == NULL) {
                        pvec = drm_malloc_ab(num_pages, sizeof(struct page *));
                        if (pvec == NULL) {
                                ret = -ENOMEM;
                                goto err_pvec;
                        }
                }

                pinned = __get_user_pages_fast(obj->userptr.ptr, num_pages,
                                               !obj->userptr.read_only, pvec);
        }

        if (pinned == num_pages) {
                ret = __i915_gem_userptr_set_pages(obj, pvec, num_pages);
                if (ret == 0) {
                        /* Pages attached: the active flag must stay
                         * raised so the mmu-notifier can revoke them. */
                        drm_free_large(pvec);
                        return 0;
                }
                release_pages(pvec, pinned, 0);
                goto out_gup;
        } else if (pinned < 0) {
                ret = pinned;
                goto out_gup;
        }

        /* Spawn a worker so that we can acquire the
         * user pages without holding our mutex. Access
         * to the user pages requires mmap_sem, and we have
         * a strict lock ordering of mmap_sem, struct_mutex -
         * we already hold struct_mutex here and so cannot
         * call gup without encountering a lock inversion.
         *
         * Userspace will keep on repeating the operation
         * (thanks to EAGAIN) until either we hit the fast
         * path or the worker completes. If the worker is
         * cancelled or superseded, the task is still run
         * but the results ignored. (This leads to
         * complications that we may have a stray object
         * refcount that we need to be wary of when
         * checking for existing objects during creation.)
         * If the worker encounters an error, it reports
         * that error back to this function through
         * obj->userptr.work = ERR_PTR.
         */

        if (pvec) {
                release_pages(pvec, pinned, 0);
                drm_free_large(pvec);
        }
        
        ret = -EAGAIN;
        if (obj->userptr.workers < I915_GEM_USERPTR_MAX_WORKERS) {
                struct get_pages_work *work;

                work = kmalloc(sizeof(*work), GFP_KERNEL);
                if (work == NULL) {
                        ret = -ENOMEM;
                        goto err_pvec;
                }
                obj->userptr.work = &work->work;
                obj->userptr.workers++;

                work->obj = obj;
                drm_gem_object_reference(&obj->base);

                work->task = current;
                get_task_struct(work->task);

                INIT_WORK(&work->work,
                                __i915_gem_userptr_get_pages_worker);
                schedule_work(&work->work);
        }

        return ret;

out_gup:
        drm_free_large(pvec);
err_pvec:
        /* No pages attached, no need for the mmu-notifier to wake us */
        __i915_gem_userptr_set_active(obj, false);

        return ret;
}
======================================================================

I think it is correct but please double check me. Maybe it needs more comments
for the main three options (gup ok, gup fail, gup incomplete) but I think it
is worth it, if for nothing else than avoiding the goto up-down jumps.

Regards,

Tvrtko