How do we serialise page faults against truncate/hole punch? For truncate, we first update isize and then truncate the pagecache in order to avoid racing against page faults. For punch_hole, we use lock_extent and truncate the pagecache.
Although we have these rules to avoid the race, it's not easy to understand how they do that. This adds a new rw_semaphore, mmap_sem, to the inode and grabs it for writing over truncate and hole punching, and for reading over page faults. Signed-off-by: Liu Bo <bo.li....@oracle.com> --- fs/btrfs/btrfs_inode.h | 7 +++++++ fs/btrfs/file.c | 40 +++++++++++++++++++++++----------------- fs/btrfs/inode.c | 14 ++++++++++++-- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 1a8fa46..f3674fd 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -195,6 +195,13 @@ struct btrfs_inode { */ struct rw_semaphore dio_sem; + /* + * To serialise page fault with truncate/punch_hole operations. + * We have to make sure that new page cannot be faulted in a section + * of the inode that is being punched. + */ + struct rw_semaphore mmap_sem; + struct inode vfs_inode; }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2d6ee1e..a5c375a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2298,11 +2298,12 @@ static int btrfs_filemap_page_mkwrite(struct vm_area_struct *vma, goto out; } + down_read(&BTRFS_I(inode)->mmap_sem); if (IS_DAX(inode)) ret = iomap_dax_fault(vma, vmf, &btrfs_iomap_ops); else ret = btrfs_page_mkwrite(vma, vmf); - + up_read(&BTRFS_I(inode)->mmap_sem); out: sb_end_pagefault(inode->i_sb); return ret; @@ -2316,10 +2317,12 @@ static int btrfs_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) return btrfs_filemap_page_mkwrite(vma, vmf); + down_read(&BTRFS_I(inode)->mmap_sem); if (IS_DAX(inode)) ret = iomap_dax_fault(vma, vmf, &btrfs_iomap_ops); else ret = filemap_fault(vma, vmf); + up_read(&BTRFS_I(inode)->mmap_sem); return ret; } @@ -2335,17 +2338,13 @@ static int btrfs_filemap_pfn_mkwrite(struct vm_area_struct *vma, sb_start_pagefault(sb); file_update_time(vma->vm_file); - /* - * How to serialise against truncate/hole punch 
similar to page_mkwrite? - * For truncate, we firstly update isize and then truncate pagecache in - * order to avoid race against page fault. - * For punch_hole, we use lock_extent and truncate pagecache. - */ + down_read(&BTRFS_I(inode)->mmap_sem); size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else ret = dax_pfn_mkwrite(vma, vmf); + up_read(&BTRFS_I(inode)->mmap_sem); sb_end_pagefault(sb); return ret; @@ -2576,6 +2575,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) BTRFS_I(inode)->root->sectorsize) - 1; same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1)); + + /* + * Prevent page faults from reinstantiating pages we have released + * from page cache. + */ + down_write(&BTRFS_I(inode)->mmap_sem); + /* * We needn't truncate any block which is beyond the end of the file * because we are sure there is no data there. @@ -2591,17 +2597,15 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } else { ret = 0; } - goto out_only_mutex; + goto out_mmap; } /* zero back part of the first block */ if (offset < ino_size) { truncated_block = true; ret = btrfs_truncate_block(inode, offset, 0, 0); - if (ret) { - inode_unlock(inode); - return ret; - } + if (ret) + goto out_mmap; } /* Check the aligned pages after the first unaligned page, @@ -2614,10 +2618,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) offset = lockstart; ret = find_first_non_hole(inode, &offset, &len); if (ret < 0) - goto out_only_mutex; + goto out_mmap; if (ret && !len) { ret = 0; - goto out_only_mutex; + goto out_mmap; } lockstart = offset; } @@ -2628,7 +2632,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (tail_len) { ret = find_first_non_hole(inode, &tail_start, &tail_len); if (unlikely(ret < 0)) - goto out_only_mutex; + goto out_mmap; if (!ret) { /* zero the 
front end of the last page */ if (tail_start + tail_len < ino_size) { @@ -2637,14 +2641,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) tail_start + tail_len, 0, 1); if (ret) - goto out_only_mutex; + goto out_mmap; } } } if (lockend < lockstart) { ret = 0; - goto out_only_mutex; + goto out_mmap; } while (1) { @@ -2814,6 +2818,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); +out_mmap: + up_write(&BTRFS_I(inode)->mmap_sem); out_only_mutex: if (!updated_inode && truncated_block && !ret && !err) { /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 227ee4e..9851422 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5069,14 +5069,21 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (ret) return ret; - /* we don't support swapfiles, so vmtruncate shouldn't fail */ - truncate_setsize(inode, newsize); + /* + * Update isize first so that if upcoming unlock dio read won't + * race with truncate if they are beyond new isize. 
+ */ + i_size_write(inode, newsize); /* Disable nonlocked read DIO to avoid the end less truncate */ btrfs_inode_block_unlocked_dio(inode); inode_dio_wait(inode); btrfs_inode_resume_unlocked_dio(inode); + down_write(&BTRFS_I(inode)->mmap_sem); + /* we don't support swapfiles, so vmtruncate shouldn't fail */ + truncate_pagecache(inode, newsize); + ret = btrfs_truncate(inode); if (ret && inode->i_nlink) { int err; @@ -5089,6 +5096,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) */ trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { + up_write(&BTRFS_I(inode)->mmap_sem); btrfs_orphan_del(NULL, inode); return ret; } @@ -5109,6 +5117,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (IS_DAX(inode)) ret = btrfs_truncate_block(inode, newsize, 0, 0); } + up_write(&BTRFS_I(inode)->mmap_sem); } return ret; @@ -9877,6 +9886,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); init_rwsem(&ei->dio_sem); + init_rwsem(&ei->mmap_sem); return inode; } -- 2.5.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html