Hi Vivek, On 2019/8/5 20:25, Vivek Goyal wrote: > On Sun, Aug 04, 2019 at 09:56:55AM +0800, piaojun wrote: >> Hi Vivek, >> >> Shall we downgrade dax mapping from rw to read-only, if current user >> open with O_RDONLY, and last user opened with O_RDWR? This will minimize >> the access authority. >> >> I wonder if my concern is necessary. And if it will cause frequent dax >> mapping? > > There is a chance that memory mapping will be reclaimed and setup as > read-only again.
I got it, and thanks for your reply. Thanks, Jun > > I will not worry about this at this point of time. > > Thanks > Vivek > >> >> Thanks, >> Jun >> >> On 2019/7/26 23:49, Vivek Goyal wrote: >>> Do not always map a dax mapping read-write. There are use cases like >>> executing a file where we want to map file read-only. virtio-fs dir on >>> host might be backed by overlayfs. We don't want to open file read-write >>> on overlayfs otherwise it will unnecessariliy be copied-up nullyifying >>> the advantages of sharing page cache between vms for unmodified files. >>> >>> Hence, create a read-only mapping if user did not ask for writable mapping. >>> Later upgrade it to read-write mapping when users requests it. >>> >>> Signed-off-by: Vivek Goyal <[email protected]> >>> --- >>> fs/fuse/file.c | 121 +++++++++++++++++++++++++++++++++++++---------- >>> fs/fuse/fuse_i.h | 3 ++ >>> 2 files changed, 98 insertions(+), 26 deletions(-) >>> >>> diff --git a/fs/fuse/file.c b/fs/fuse/file.c >>> index a2c19e4a28b5..5277de7028a6 100644 >>> --- a/fs/fuse/file.c >>> +++ b/fs/fuse/file.c >>> @@ -265,7 +265,8 @@ static void dmap_add_to_free_pool(struct fuse_conn *fc, >>> >>> /* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */ >>> static int fuse_setup_one_mapping(struct inode *inode, loff_t offset, >>> - struct fuse_dax_mapping *dmap) >>> + struct fuse_dax_mapping *dmap, bool writable, >>> + bool upgrade) >>> { >>> struct fuse_conn *fc = get_fuse_conn(inode); >>> struct fuse_inode *fi = get_fuse_inode(inode); >>> @@ -283,7 +284,8 @@ static int fuse_setup_one_mapping(struct inode *inode, >>> loff_t offset, >>> inarg.moffset = dmap->window_offset; >>> inarg.len = FUSE_DAX_MEM_RANGE_SZ; >>> inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ; >>> - inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE; >>> + if (writable) >>> + inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE; >>> args.in.h.opcode = FUSE_SETUPMAPPING; >>> args.in.h.nodeid = fi->nodeid; >>> args.in.numargs = 1; >>> @@ -296,26 +298,30 @@ static int fuse_setup_one_mapping(struct inode >>> *inode, loff_t offset, >>> return err; >>> } >>> >>> - pr_debug("fuse_setup_one_mapping() succeeded. offset=0x%llx err=%zd\n", >>> offset, err); >>> + pr_debug("fuse_setup_one_mapping() succeeded. offset=0x%llx writable=%d" >>> + " err=%zd\n", offset, writable, err); >>> >>> - /* >>> - * We don't take a refernce on inode. inode is valid right now and >>> - * when inode is going away, cleanup logic should first cleanup >>> - * dmap entries. >>> - * >>> - * TODO: Do we need to ensure that we are holding inode lock >>> - * as well. >>> - */ >>> - dmap->inode = inode; >>> - dmap->start = offset; >>> - dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1; >>> - /* Protected by fi->i_dmap_sem */ >>> - fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree); >>> - fi->nr_dmaps++; >>> - spin_lock(&fc->lock); >>> - list_add_tail(&dmap->busy_list, &fc->busy_ranges); >>> - fc->nr_busy_ranges++; >>> - spin_unlock(&fc->lock); >>> + dmap->writable = writable; >>> + if (!upgrade) { >>> + /* >>> + * We don't take a refernce on inode. inode is valid right now >>> + * and when inode is going away, cleanup logic should first >>> + * cleanup dmap entries. >>> + * >>> + * TODO: Do we need to ensure that we are holding inode lock >>> + * as well. >>> + */ >>> + dmap->inode = inode; >>> + dmap->start = offset; >>> + dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1; >>> + /* Protected by fi->i_dmap_sem */ >>> + fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree); >>> + fi->nr_dmaps++; >>> + spin_lock(&fc->lock); >>> + list_add_tail(&dmap->busy_list, &fc->busy_ranges); >>> + fc->nr_busy_ranges++; >>> + spin_unlock(&fc->lock); >>> + } >>> return 0; >>> } >>> >>> @@ -1890,6 +1896,7 @@ static int iomap_begin_setup_new_mapping(struct inode >>> *inode, loff_t pos, >>> struct fuse_conn *fc = get_fuse_conn(inode); >>> struct fuse_dax_mapping *dmap, *alloc_dmap = NULL; >>> int ret; >>> + bool writable = flags & IOMAP_WRITE; >>> >>> /* Can't do reclaim in fault path yet due to lock ordering. >>> * Read path takes shared inode lock and that's not sufficient >>> @@ -1930,10 +1937,11 @@ static int iomap_begin_setup_new_mapping(struct >>> inode *inode, loff_t pos, >>> >>> /* Setup one mapping */ >>> ret = fuse_setup_one_mapping(inode, >>> - ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ), alloc_dmap); >>> + ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ), >>> + alloc_dmap, writable, false); >>> if (ret < 0) { >>> printk("fuse_setup_one_mapping() failed. err=%d" >>> - " pos=0x%llx\n", ret, pos); >>> + " pos=0x%llx, writable=%d\n", ret, pos, writable); >>> dmap_add_to_free_pool(fc, alloc_dmap); >>> up_write(&fi->i_dmap_sem); >>> return ret; >>> @@ -1943,6 +1951,52 @@ static int iomap_begin_setup_new_mapping(struct >>> inode *inode, loff_t pos, >>> return 0; >>> } >>> >>> +static int iomap_begin_upgrade_mapping(struct inode *inode, loff_t pos, >>> + loff_t length, unsigned flags, >>> + struct iomap *iomap) >>> +{ >>> + struct fuse_inode *fi = get_fuse_inode(inode); >>> + struct fuse_dax_mapping *dmap; >>> + int ret; >>> + >>> + /* >>> + * Take exclusive lock so that only one caller can try to setup >>> + * mapping and others wait. >>> + */ >>> + down_write(&fi->i_dmap_sem); >>> + dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos); >>> + >>> + /* We are holding either inode lock or i_mmap_sem, and that should >>> + * ensure that dmap can't reclaimed or truncated and it should still >>> + * be there in tree despite the fact we dropped and re-acquired the >>> + * lock. >>> + */ >>> + ret = -EIO; >>> + if (WARN_ON(!dmap)) >>> + goto out_err; >>> + >>> + /* Maybe another thread already upgraded mapping while we were not >>> + * holding lock. >>> + */ >>> + if (dmap->writable) >>> + goto out_fill_iomap; >>> + >>> + ret = fuse_setup_one_mapping(inode, >>> + ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ), >>> + dmap, true, true); >>> + if (ret < 0) { >>> + printk("fuse_setup_one_mapping() failed. err=%d pos=0x%llx\n", >>> + ret, pos); >>> + goto out_err; >>> + } >>> + >>> +out_fill_iomap: >>> + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); >>> +out_err: >>> + up_write(&fi->i_dmap_sem); >>> + return ret; >>> +} >>> + >>> /* This is just for DAX and the mapping is ephemeral, do not use it for >>> other >>> * purposes since there is no block device with a permanent mapping. >>> */ >>> @@ -1952,6 +2006,7 @@ static int fuse_iomap_begin(struct inode *inode, >>> loff_t pos, loff_t length, >>> struct fuse_inode *fi = get_fuse_inode(inode); >>> struct fuse_conn *fc = get_fuse_conn(inode); >>> struct fuse_dax_mapping *dmap; >>> + bool writable = flags & IOMAP_WRITE; >>> >>> /* We don't support FIEMAP */ >>> BUG_ON(flags & IOMAP_REPORT); >>> @@ -1982,9 +2037,23 @@ static int fuse_iomap_begin(struct inode *inode, >>> loff_t pos, loff_t length, >>> dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, pos); >>> >>> if (dmap) { >>> - fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); >>> - up_read(&fi->i_dmap_sem); >>> - return 0; >>> + if (writable && !dmap->writable) { >>> + /* Upgrade read-only mapping to read-write. This will >>> + * require exclusive i_dmap_sem lock as we don't want >>> + * two threads to be trying to this simultaneously >>> + * for same dmap. So drop shared lock and acquire >>> + * exclusive lock. >>> + */ >>> + up_read(&fi->i_dmap_sem); >>> + pr_debug("%s: Upgrading mapping at offset 0x%llx" >>> + " length 0x%llx\n", __func__, pos, length); >>> + return iomap_begin_upgrade_mapping(inode, pos, length, >>> + flags, iomap); >>> + } else { >>> + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); >>> + up_read(&fi->i_dmap_sem); >>> + return 0; >>> + } >>> } else { >>> up_read(&fi->i_dmap_sem); >>> pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n", >>> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h >>> index e899a06e29d7..34ca8b90a1e1 100644 >>> --- a/fs/fuse/fuse_i.h >>> +++ b/fs/fuse/fuse_i.h >>> @@ -134,6 +134,9 @@ struct fuse_dax_mapping { >>> >>> /** Length of mapping, in bytes */ >>> loff_t length; >>> + >>> + /* Is this mapping read-only or read-write */ >>> + bool writable; >>> }; >>> >>> /** FUSE inode */ >>> > . >
