On Thu, 19 May 2011, Greg KH wrote: > 2.6.33-longterm review patch. If anyone has any objections, please let us > know.
Witold has found that I screwed up the highmem case in this patch. Please add the commit appended at the bottom - or else delay both until the next 2.6.33-longterm if you prefer. Thanks, Hugh > > ------------------ > Content-Length: 6159 > Lines: 197 > > From: Hugh Dickins <[email protected]> > > commit 778dd893ae785c5fd505dac30b5fc40aae188bf1 upstream. > > The use of igrab() in swapoff's shmem_unuse_inode() is just as vulnerable > to umount as that in shmem_writepage(). > > Fix this instance by extending the protection of shmem_swaplist_mutex > right across shmem_unuse_inode(): while it's on the list, the inode cannot > be evicted (and the filesystem cannot be unmounted) without > shmem_evict_inode() taking that mutex to remove it from the list. > > But since shmem_writepage() might take that mutex, we should avoid making > memory allocations or memcg charges while holding it: prepare them at the > outer level in shmem_unuse(). When mem_cgroup_cache_charge() was > originally placed, we didn't know until that point that the page from swap > was actually a shmem page; but nowadays it's noted in the swap_map, so > we're safe to charge upfront. For the radix_tree, do as is done in > shmem_getpage(): preload upfront, but don't pin to the cpu; so we make a > habit of refreshing the node pool, but might dip into GFP_NOWAIT reserves > on occasion if subsequently preempted. > > With the allocation and charge moved out from shmem_unuse_inode(), > we can also hold index map and info->lock over from finding the entry. > > Signed-off-by: Hugh Dickins <[email protected]> > Cc: Konstantin Khlebnikov <[email protected]> > Signed-off-by: Andrew Morton <[email protected]> > Signed-off-by: Linus Torvalds <[email protected]> > Signed-off-by: Greg Kroah-Hartman <[email protected]> > > --- > mm/shmem.c | 88 > +++++++++++++++++++++++++++++-------------------------------- > 1 file changed, 43 insertions(+), 45 deletions(-) > > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -850,7 +850,7 @@ static inline int shmem_find_swp(swp_ent > > static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t > entry, struct page *page) > { > - struct inode *inode; > + struct address_space *mapping; > unsigned long idx; > unsigned long size; > unsigned long limit; > @@ -873,8 +873,10 @@ static int shmem_unuse_inode(struct shme > if (size > SHMEM_NR_DIRECT) > size = SHMEM_NR_DIRECT; > offset = shmem_find_swp(entry, ptr, ptr+size); > - if (offset >= 0) > + if (offset >= 0) { > + shmem_swp_balance_unmap(); > goto found; > + } > if (!info->i_indirect) > goto lost2; > > @@ -912,11 +914,11 @@ static int shmem_unuse_inode(struct shme > if (size > ENTRIES_PER_PAGE) > size = ENTRIES_PER_PAGE; > offset = shmem_find_swp(entry, ptr, ptr+size); > - shmem_swp_unmap(ptr); > if (offset >= 0) { > shmem_dir_unmap(dir); > goto found; > } > + shmem_swp_unmap(ptr); > } > } > lost1: > @@ -926,8 +928,7 @@ lost2: > return 0; > found: > idx += offset; > - inode = igrab(&info->vfs_inode); > - spin_unlock(&info->lock); > + ptr += offset; > > /* > * Move _head_ to start search for next from here. > @@ -938,37 +939,18 @@ found: > */ > if (shmem_swaplist.next != &info->swaplist) > list_move_tail(&shmem_swaplist, &info->swaplist); > - mutex_unlock(&shmem_swaplist_mutex); > > - error = 1; > - if (!inode) > - goto out; > /* > - * Charge page using GFP_KERNEL while we can wait. > - * Charged back to the user(not to caller) when swap account is used. > - * add_to_page_cache() will be called with GFP_NOWAIT. > + * We rely on shmem_swaplist_mutex, not only to protect the swaplist, > + * but also to hold up shmem_evict_inode(): so inode cannot be freed > + * beneath us (pagelock doesn't help until the page is in pagecache). > */ > - error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); > - if (error) > - goto out; > - error = radix_tree_preload(GFP_KERNEL); > - if (error) { > - mem_cgroup_uncharge_cache_page(page); > - goto out; > - } > - error = 1; > - > - spin_lock(&info->lock); > - ptr = shmem_swp_entry(info, idx, NULL); > - if (ptr && ptr->val == entry.val) { > - error = add_to_page_cache_locked(page, inode->i_mapping, > - idx, GFP_NOWAIT); > - /* does mem_cgroup_uncharge_cache_page on error */ > - } else /* we must compensate for our precharge above */ > - mem_cgroup_uncharge_cache_page(page); > + mapping = info->vfs_inode.i_mapping; > + error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); > + /* which does mem_cgroup_uncharge_cache_page on error */ > > if (error == -EEXIST) { > - struct page *filepage = find_get_page(inode->i_mapping, idx); > + struct page *filepage = find_get_page(mapping, idx); > error = 1; > if (filepage) { > /* > @@ -988,14 +970,8 @@ found: > swap_free(entry); > error = 1; /* not an error, but entry was found */ > } > - if (ptr) > - shmem_swp_unmap(ptr); > + shmem_swp_unmap(ptr); > spin_unlock(&info->lock); > - radix_tree_preload_end(); > -out: > - unlock_page(page); > - page_cache_release(page); > - iput(inode); /* allows for NULL */ > return error; > } > > @@ -1007,6 +983,26 @@ int shmem_unuse(swp_entry_t entry, struc > struct list_head *p, *next; > struct shmem_inode_info *info; > int found = 0; > + int error; > + > + /* > + * Charge page using GFP_KERNEL while we can wait, before taking > + * the shmem_swaplist_mutex which might hold up shmem_writepage(). > + * Charged back to the user (not to caller) when swap account is used. > + * add_to_page_cache() will be called with GFP_NOWAIT. > + */ > + error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); > + if (error) > + goto out; > + /* > + * Try to preload while we can wait, to not make a habit of > + * draining atomic reserves; but don't latch on to this cpu, > + * it's okay if sometimes we get rescheduled after this. > + */ > + error = radix_tree_preload(GFP_KERNEL); > + if (error) > + goto uncharge; > + radix_tree_preload_end(); > > mutex_lock(&shmem_swaplist_mutex); > list_for_each_safe(p, next, &shmem_swaplist) { > @@ -1014,17 +1010,19 @@ int shmem_unuse(swp_entry_t entry, struc > found = shmem_unuse_inode(info, entry, page); > cond_resched(); > if (found) > - goto out; > + break; > } > mutex_unlock(&shmem_swaplist_mutex); > - /* > - * Can some race bring us here? We've been holding page lock, > - * so I think not; but would rather try again later than BUG() > - */ > + > +uncharge: > + if (!found) > + mem_cgroup_uncharge_cache_page(page); > + if (found < 0) > + error = found; > +out: > unlock_page(page); > page_cache_release(page); > -out: > - return (found < 0) ? found : 0; > + return error; > } > > /* commit e6c9366b2adb52cba64b359b3050200743c7568c Author: Hugh Dickins <[email protected]> Date: Fri May 20 15:47:33 2011 -0700 tmpfs: fix highmem swapoff crash regression Commit 778dd893ae78 ("tmpfs: fix race between umount and swapoff") forgot the new rules for strict atomic kmap nesting, causing WARNING: at arch/x86/mm/highmem_32.c:81 from __kunmap_atomic(), then BUG: unable to handle kernel paging request at fffb9000 from shmem_swp_set() when shmem_unuse_inode() is handling swapoff with highmem in use. My disgrace again. See https://bugzilla.kernel.org/show_bug.cgi?id=35352 Reported-by: Witold Baryluk <[email protected]> Signed-off-by: Hugh Dickins <[email protected]> Cc: [email protected] Signed-off-by: Linus Torvalds <[email protected]> diff --git a/mm/shmem.c b/mm/shmem.c index dfc7069..ba4ad28 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -916,11 +916,12 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s if (size > ENTRIES_PER_PAGE) size = ENTRIES_PER_PAGE; offset = shmem_find_swp(entry, ptr, ptr+size); + shmem_swp_unmap(ptr); if (offset >= 0) { shmem_dir_unmap(dir); + ptr = shmem_swp_map(subdir); goto found; } - shmem_swp_unmap(ptr); } } lost1: _______________________________________________ stable mailing list [email protected] http://linux.kernel.org/mailman/listinfo/stable
