Re: [stable] [07/24] tmpfs: fix race between umount and swapoff

Hugh Dickins Fri, 20 May 2011 21:53:15 -0700

On Thu, 19 May 2011, Greg KH wrote:
> 2.6.33-longterm review patch.  If anyone has any objections, please let us 
> know.


Witold has found that I screwed up the highmem case in this patch.
Please add the commit appended at the bottom - or else delay both
until the next 2.6.33-longterm if you prefer.

Thanks,
Hugh

> 
> ------------------
> Content-Length: 6159
> Lines: 197
> 
> From: Hugh Dickins <[email protected]>
> 
> commit 778dd893ae785c5fd505dac30b5fc40aae188bf1 upstream.
> 
> The use of igrab() in swapoff's shmem_unuse_inode() is just as vulnerable
> to umount as that in shmem_writepage().
> 
> Fix this instance by extending the protection of shmem_swaplist_mutex
> right across shmem_unuse_inode(): while it's on the list, the inode cannot
> be evicted (and the filesystem cannot be unmounted) without
> shmem_evict_inode() taking that mutex to remove it from the list.
> 
> But since shmem_writepage() might take that mutex, we should avoid making
> memory allocations or memcg charges while holding it: prepare them at the
> outer level in shmem_unuse().  When mem_cgroup_cache_charge() was
> originally placed, we didn't know until that point that the page from swap
> was actually a shmem page; but nowadays it's noted in the swap_map, so
> we're safe to charge upfront.  For the radix_tree, do as is done in
> shmem_getpage(): preload upfront, but don't pin to the cpu; so we make a
> habit of refreshing the node pool, but might dip into GFP_NOWAIT reserves
> on occasion if subsequently preempted.
> 
> With the allocation and charge moved out from shmem_unuse_inode(),
> we can also hold index map and info->lock over from finding the entry.
> 
> Signed-off-by: Hugh Dickins <[email protected]>
> Cc: Konstantin Khlebnikov <[email protected]>
> Signed-off-by: Andrew Morton <[email protected]>
> Signed-off-by: Linus Torvalds <[email protected]>
> Signed-off-by: Greg Kroah-Hartman <[email protected]>
> 
> ---
>  mm/shmem.c |   88 
> +++++++++++++++++++++++++++++--------------------------------
>  1 file changed, 43 insertions(+), 45 deletions(-)
> 
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -850,7 +850,7 @@ static inline int shmem_find_swp(swp_ent
>  
>  static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t 
> entry, struct page *page)
>  {
> -     struct inode *inode;
> +     struct address_space *mapping;
>       unsigned long idx;
>       unsigned long size;
>       unsigned long limit;
> @@ -873,8 +873,10 @@ static int shmem_unuse_inode(struct shme
>       if (size > SHMEM_NR_DIRECT)
>               size = SHMEM_NR_DIRECT;
>       offset = shmem_find_swp(entry, ptr, ptr+size);
> -     if (offset >= 0)
> +     if (offset >= 0) {
> +             shmem_swp_balance_unmap();
>               goto found;
> +     }
>       if (!info->i_indirect)
>               goto lost2;
>  
> @@ -912,11 +914,11 @@ static int shmem_unuse_inode(struct shme
>                       if (size > ENTRIES_PER_PAGE)
>                               size = ENTRIES_PER_PAGE;
>                       offset = shmem_find_swp(entry, ptr, ptr+size);
> -                     shmem_swp_unmap(ptr);
>                       if (offset >= 0) {
>                               shmem_dir_unmap(dir);
>                               goto found;
>                       }
> +                     shmem_swp_unmap(ptr);
>               }
>       }
>  lost1:
> @@ -926,8 +928,7 @@ lost2:
>       return 0;
>  found:
>       idx += offset;
> -     inode = igrab(&info->vfs_inode);
> -     spin_unlock(&info->lock);
> +     ptr += offset;
>  
>       /*
>        * Move _head_ to start search for next from here.
> @@ -938,37 +939,18 @@ found:
>        */
>       if (shmem_swaplist.next != &info->swaplist)
>               list_move_tail(&shmem_swaplist, &info->swaplist);
> -     mutex_unlock(&shmem_swaplist_mutex);
>  
> -     error = 1;
> -     if (!inode)
> -             goto out;
>       /*
> -      * Charge page using GFP_KERNEL while we can wait.
> -      * Charged back to the user(not to caller) when swap account is used.
> -      * add_to_page_cache() will be called with GFP_NOWAIT.
> +      * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
> +      * but also to hold up shmem_evict_inode(): so inode cannot be freed
> +      * beneath us (pagelock doesn't help until the page is in pagecache).
>        */
> -     error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
> -     if (error)
> -             goto out;
> -     error = radix_tree_preload(GFP_KERNEL);
> -     if (error) {
> -             mem_cgroup_uncharge_cache_page(page);
> -             goto out;
> -     }
> -     error = 1;
> -
> -     spin_lock(&info->lock);
> -     ptr = shmem_swp_entry(info, idx, NULL);
> -     if (ptr && ptr->val == entry.val) {
> -             error = add_to_page_cache_locked(page, inode->i_mapping,
> -                                             idx, GFP_NOWAIT);
> -             /* does mem_cgroup_uncharge_cache_page on error */
> -     } else  /* we must compensate for our precharge above */
> -             mem_cgroup_uncharge_cache_page(page);
> +     mapping = info->vfs_inode.i_mapping;
> +     error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
> +     /* which does mem_cgroup_uncharge_cache_page on error */
>  
>       if (error == -EEXIST) {
> -             struct page *filepage = find_get_page(inode->i_mapping, idx);
> +             struct page *filepage = find_get_page(mapping, idx);
>               error = 1;
>               if (filepage) {
>                       /*
> @@ -988,14 +970,8 @@ found:
>               swap_free(entry);
>               error = 1;      /* not an error, but entry was found */
>       }
> -     if (ptr)
> -             shmem_swp_unmap(ptr);
> +     shmem_swp_unmap(ptr);
>       spin_unlock(&info->lock);
> -     radix_tree_preload_end();
> -out:
> -     unlock_page(page);
> -     page_cache_release(page);
> -     iput(inode);            /* allows for NULL */
>       return error;
>  }
>  
> @@ -1007,6 +983,26 @@ int shmem_unuse(swp_entry_t entry, struc
>       struct list_head *p, *next;
>       struct shmem_inode_info *info;
>       int found = 0;
> +     int error;
> +
> +     /*
> +      * Charge page using GFP_KERNEL while we can wait, before taking
> +      * the shmem_swaplist_mutex which might hold up shmem_writepage().
> +      * Charged back to the user (not to caller) when swap account is used.
> +      * add_to_page_cache() will be called with GFP_NOWAIT.
> +      */
> +     error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
> +     if (error)
> +             goto out;
> +     /*
> +      * Try to preload while we can wait, to not make a habit of
> +      * draining atomic reserves; but don't latch on to this cpu,
> +      * it's okay if sometimes we get rescheduled after this.
> +      */
> +     error = radix_tree_preload(GFP_KERNEL);
> +     if (error)
> +             goto uncharge;
> +     radix_tree_preload_end();
>  
>       mutex_lock(&shmem_swaplist_mutex);
>       list_for_each_safe(p, next, &shmem_swaplist) {
> @@ -1014,17 +1010,19 @@ int shmem_unuse(swp_entry_t entry, struc
>               found = shmem_unuse_inode(info, entry, page);
>               cond_resched();
>               if (found)
> -                     goto out;
> +                     break;
>       }
>       mutex_unlock(&shmem_swaplist_mutex);
> -     /*
> -      * Can some race bring us here?  We've been holding page lock,
> -      * so I think not; but would rather try again later than BUG()
> -      */
> +
> +uncharge:
> +     if (!found)
> +             mem_cgroup_uncharge_cache_page(page);
> +     if (found < 0)
> +             error = found;
> +out:
>       unlock_page(page);
>       page_cache_release(page);
> -out:
> -     return (found < 0) ? found : 0;
> +     return error;
>  }
>  
>  /*

commit e6c9366b2adb52cba64b359b3050200743c7568c
Author: Hugh Dickins <[email protected]>
Date:   Fri May 20 15:47:33 2011 -0700

    tmpfs: fix highmem swapoff crash regression
    
    Commit 778dd893ae78 ("tmpfs: fix race between umount and swapoff")
    forgot the new rules for strict atomic kmap nesting, causing
    
      WARNING: at arch/x86/mm/highmem_32.c:81
    
    from __kunmap_atomic(), then
    
      BUG: unable to handle kernel paging request at fffb9000
    
    from shmem_swp_set() when shmem_unuse_inode() is handling swapoff with
    highmem in use.  My disgrace again.
    
    See
      https://bugzilla.kernel.org/show_bug.cgi?id=35352
    
    Reported-by: Witold Baryluk <[email protected]>
    Signed-off-by: Hugh Dickins <[email protected]>
    Cc: [email protected]
    Signed-off-by: Linus Torvalds <[email protected]>

diff --git a/mm/shmem.c b/mm/shmem.c
index dfc7069..ba4ad28 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -916,11 +916,12 @@ static int shmem_unuse_inode(struct shmem_inode_info 
*info, swp_entry_t entry, s
                        if (size > ENTRIES_PER_PAGE)
                                size = ENTRIES_PER_PAGE;
                        offset = shmem_find_swp(entry, ptr, ptr+size);
+                       shmem_swp_unmap(ptr);
                        if (offset >= 0) {
                                shmem_dir_unmap(dir);
+                               ptr = shmem_swp_map(subdir);
                                goto found;
                        }
-                       shmem_swp_unmap(ptr);
                }
        }
 lost1:

_______________________________________________
stable mailing list
[email protected]
http://linux.kernel.org/mailman/listinfo/stable

Re: [stable] [07/24] tmpfs: fix race between umount and swapoff

Reply via email to