On Thu, Jul 25, 2019 at 04:30:13PM -0400, Vivek Goyal wrote:
> On Thu, Jul 25, 2019 at 12:32:50PM -0700, Liu Bo wrote:
> > On Thu, Jul 25, 2019 at 11:35:21AM -0400, Vivek Goyal wrote:
> > > On Wed, Jul 24, 2019 at 05:08:58PM -0700, Liu Bo wrote:
> > > > On Wed, Jul 24, 2019 at 05:07:21PM -0400, Vivek Goyal wrote:
> > > > > Do not always map a dax mapping read-write. There are use cases like
> > > > > executing a file where we want to map file read-only. virtio-fs dir on
> > > > > host might be backed by overlayfs. We don't want to open file 
> > > > > read-write
> > > > > on overlayfs otherwise it will unnecessariliy be copied-up nullyifying
> > > > > the advantages of sharing page cache between vms for unmodified files.
> > > > > 
> > > > > Hence, create a read-only mapping if user did not ask for writable 
> > > > > mapping.
> > > > > Later upgrade it to read-write mapping when users requests it.
> > > > > 
> > > > > Signed-off-by: Vivek Goyal <[email protected]>
> > > > > ---
> > > > >  fs/fuse/file.c   | 114 
> > > > > ++++++++++++++++++++++++++++++++++++-----------
> > > > >  fs/fuse/fuse_i.h |   3 ++
> > > > >  2 files changed, 91 insertions(+), 26 deletions(-)
> > > > > 
> > > > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> > > > > index a2c19e4a28b5..bcd8ddcc0fdf 100644
> > > > > --- a/fs/fuse/file.c
> > > > > +++ b/fs/fuse/file.c
> > > > > @@ -265,7 +265,8 @@ static void dmap_add_to_free_pool(struct 
> > > > > fuse_conn *fc,
> > > > >  
> > > > >  /* offset passed in should be aligned to FUSE_DAX_MEM_RANGE_SZ */
> > > > >  static int fuse_setup_one_mapping(struct inode *inode, loff_t offset,
> > > > > -                               struct fuse_dax_mapping *dmap)
> > > > > +                               struct fuse_dax_mapping *dmap, bool 
> > > > > writable,
> > > > > +                               bool upgrade)
> > > > >  {
> > > > >       struct fuse_conn *fc = get_fuse_conn(inode);
> > > > >       struct fuse_inode *fi = get_fuse_inode(inode);
> > > > > @@ -283,7 +284,8 @@ static int fuse_setup_one_mapping(struct inode 
> > > > > *inode, loff_t offset,
> > > > >       inarg.moffset = dmap->window_offset;
> > > > >       inarg.len = FUSE_DAX_MEM_RANGE_SZ;
> > > > >       inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
> > > > > -     inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
> > > > > +     if (writable)
> > > > > +             inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
> > > > >       args.in.h.opcode = FUSE_SETUPMAPPING;
> > > > >       args.in.h.nodeid = fi->nodeid;
> > > > >       args.in.numargs = 1;
> > > > > @@ -296,26 +298,30 @@ static int fuse_setup_one_mapping(struct inode 
> > > > > *inode, loff_t offset,
> > > > >               return err;
> > > > >       }
> > > > >  
> > > > > -     pr_debug("fuse_setup_one_mapping() succeeded. offset=0x%llx 
> > > > > err=%zd\n", offset, err);
> > > > > +     pr_debug("fuse_setup_one_mapping() succeeded. offset=0x%llx 
> > > > > writable=%d"
> > > > > +              " err=%zd\n", offset, writable, err);
> > > > >  
> > > > > -     /*
> > > > > -      * We don't take a refernce on inode. inode is valid right now 
> > > > > and
> > > > > -      * when inode is going away, cleanup logic should first cleanup
> > > > > -      * dmap entries.
> > > > > -      *
> > > > > -      * TODO: Do we need to ensure that we are holding inode lock
> > > > > -      * as well.
> > > > > -      */
> > > > > -     dmap->inode = inode;
> > > > > -     dmap->start = offset;
> > > > > -     dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1;
> > > > > -     /* Protected by fi->i_dmap_sem */
> > > > > -     fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree);
> > > > > -     fi->nr_dmaps++;
> > > > > -     spin_lock(&fc->lock);
> > > > > -     list_add_tail(&dmap->busy_list, &fc->busy_ranges);
> > > > > -     fc->nr_busy_ranges++;
> > > > > -     spin_unlock(&fc->lock);
> > > > > +     dmap->writable = writable;
> > > > > +     if (!upgrade) {
> > > > > +             /*
> > > > > +              * We don't take a refernce on inode. inode is valid 
> > > > > right now
> > > > > +              * and when inode is going away, cleanup logic should 
> > > > > first
> > > > > +              * cleanup dmap entries.
> > > > > +              *
> > > > > +              * TODO: Do we need to ensure that we are holding inode 
> > > > > lock
> > > > > +              * as well.
> > > > > +              */
> > > > > +             dmap->inode = inode;
> > > > > +             dmap->start = offset;
> > > > > +             dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1;
> > > > > +             /* Protected by fi->i_dmap_sem */
> > > > > +             fuse_dax_interval_tree_insert(dmap, &fi->dmap_tree);
> > > > > +             fi->nr_dmaps++;
> > > > > +             spin_lock(&fc->lock);
> > > > > +             list_add_tail(&dmap->busy_list, &fc->busy_ranges);
> > > > > +             fc->nr_busy_ranges++;
> > > > > +             spin_unlock(&fc->lock);
> > > > > +     }
> > > > >       return 0;
> > > > >  }
> > > > >  
> > > > > @@ -1890,6 +1896,7 @@ static int iomap_begin_setup_new_mapping(struct 
> > > > > inode *inode, loff_t pos,
> > > > >       struct fuse_conn *fc = get_fuse_conn(inode);
> > > > >       struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
> > > > >       int ret;
> > > > > +     bool writable = flags & IOMAP_WRITE;
> > > > >  
> > > > >       /* Can't do reclaim in fault path yet due to lock ordering.
> > > > >        * Read path takes shared inode lock and that's not sufficient
> > > > > @@ -1930,10 +1937,11 @@ static int 
> > > > > iomap_begin_setup_new_mapping(struct inode *inode, loff_t pos,
> > > > >  
> > > > >       /* Setup one mapping */
> > > > >       ret = fuse_setup_one_mapping(inode,
> > > > > -             ALIGN_DOWN(pos, FUSE_DAX_MEM_RANGE_SZ), alloc_dmap);
> > > > > +                                  ALIGN_DOWN(pos, 
> > > > > FUSE_DAX_MEM_RANGE_SZ),
> > > > > +                                  alloc_dmap, writable, false);
> > > > >       if (ret < 0) {
> > > > >               printk("fuse_setup_one_mapping() failed. err=%d"
> > > > > -                     " pos=0x%llx\n", ret, pos);
> > > > > +                     " pos=0x%llx, writable=%d\n", ret, pos, 
> > > > > writable);
> > > > >               dmap_add_to_free_pool(fc, alloc_dmap);
> > > > >               up_write(&fi->i_dmap_sem);
> > > > >               return ret;
> > > > > @@ -1943,6 +1951,45 @@ static int 
> > > > > iomap_begin_setup_new_mapping(struct inode *inode, loff_t pos,
> > > > >       return 0;
> > > > >  }
> > > > >  
> > > > > +static int iomap_begin_upgrade_mapping(struct inode *inode, loff_t 
> > > > > pos,
> > > > > +                                      loff_t length, unsigned flags,
> > > > > +                                      struct iomap *iomap)
> > > > > +{
> > > > > +     struct fuse_inode *fi = get_fuse_inode(inode);
> > > > > +     struct fuse_dax_mapping *dmap;
> > > > > +     int ret;
> > > > > +
> > > > > +     /*
> > > > > +      * Take exclusive lock so that only one caller can try to setup
> > > > > +      * mapping and others wait.
> > > > > +      */
> > > > > +     down_write(&fi->i_dmap_sem);
> > > > > +     dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, pos, 
> > > > > pos);
> > > > > +
> > > > > +     /* We are holding either inode lock or i_mmap_sem, and that 
> > > > > should
> > > > > +      * ensure that dmap can't reclaimed or truncated and it should 
> > > > > still
> > > > > +      * be there in tree despite the fact we dropped and re-acquired 
> > > > > the
> > > > > +      * lock.
> > > > > +      */
> > > > > +     if (WARN_ON(!dmap))
> > > > > +             return -EIO;
> > > > 
> > > > up_write() is missed here.
> > > 
> > > Good catch. Will fix it.
> > > 
> > > > 
> 
> [..]
> > > > And I posted a similar patch[1] but with inode_lock being dropped in
> > > > mind, anything wrong with that patch?
> > > 
> > > It was an old posting so did not pay much attention to it. Looking at
> > > it now and few things I notice.
> > > 
> > > - That patch will allocate and possibly wait for memory range and free
> > >   it up when upgrading from read-only to read-write. That's not required.
> > >
> > 
> > What I was trying to say is that the above assumption about "dmap is still 
> > in
> > tree despite dropping-acquiring lock" is doubtful if inode lock is not held 
> > by
> > reclaim code.
> 
> In that case we will take reference on dmap before dropping the lock to
> make sure dmap is not reclaimed in the time we dropped the lock and
> re-acquired it? And that means there is no need to allocate dmap for
> upgrade case.

Ah makes sense, once we pin it, it won't go away underneath.

Other than this, feel free to add

Reviewed-by: Liu Bo <[email protected]>

thanks,
-liubo
> 
> > 
> > There is a window between dropping and reacquiring lock, if it gets 
> > reclaimed,
> > we're supposed to continue on the allocation path.
> 
> Thanks
> Vivek

Reply via email to