Re: [dm-devel] [PATCH v5 10/10] fs: add support for copy file range in zonefs

Damien Le Moal Wed, 23 Nov 2022 17:49:26 -0800

On 11/24/22 10:32, Damien Le Moal wrote:
> On 11/23/22 14:58, Nitesh Shetty wrote:
>> copy_file_range is implemented using copy offload,
>> copy offloading to device is always enabled.
>> To disable copy offloading mount with "no_copy_offload" mount option.
> 
> And were is the code that handle this option ?
> 
>> At present copy offload is only used, if the source and destination files
>> are on same block device, otherwise copy file range is completed by
>> generic copy file range.
>>
>> copy file range implemented as following:
>>      - write pending writes on the src and dest files
>>      - drop page cache for dest file if its conv zone
>>      - copy the range using offload
>>      - update dest file info
>>
>> For all failure cases we fallback to generic file copy range
> 
> For all cases ? That would be weird. What would be the point of trying to
> copy again if e.g. the dest zone has gone offline or read only ?
> 
>> At present this implementation does not support conv aggregation
> 
> Please check this commit message overall: the grammar and punctuation
> could really be improved.
> 
>>
>> Signed-off-by: Nitesh Shetty <[email protected]>
>> Signed-off-by: Anuj Gupta <[email protected]>
>> ---
>>  fs/zonefs/super.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 179 insertions(+)
>>
>> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
>> index abc9a85106f2..15613433d4ae 100644
>> --- a/fs/zonefs/super.c
>> +++ b/fs/zonefs/super.c
>> @@ -1223,6 +1223,183 @@ static int zonefs_file_release(struct inode *inode, 
>> struct file *file)
>>      return 0;
>>  }
>>  
>> +static int zonefs_is_file_copy_offset_ok(struct inode *src_inode,
>> +            struct inode *dst_inode, loff_t src_off, loff_t dst_off,
>> +            size_t *len)
>> +{
>> +    loff_t size, endoff;
>> +    struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
>> +
>> +    inode_lock(src_inode);
>> +    size = i_size_read(src_inode);
>> +    inode_unlock(src_inode);
>> +    /* Don't copy beyond source file EOF. */
>> +    if (src_off < size) {
>> +            if (src_off + *len > size)
>> +                    *len = (size - (src_off + *len));
>> +    } else
>> +            *len = 0;
> 
> Missing curly brackets for the else.
> 
>> +
>> +    mutex_lock(&dst_zi->i_truncate_mutex);
>> +    if (dst_zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
>> +            if (*len > dst_zi->i_max_size - dst_zi->i_wpoffset)
>> +                    *len -= dst_zi->i_max_size - dst_zi->i_wpoffset;
>> +
>> +            if (dst_off != dst_zi->i_wpoffset)
>> +                    goto err;
>> +    }
>> +    mutex_unlock(&dst_zi->i_truncate_mutex);
>> +
>> +    endoff = dst_off + *len;
>> +    inode_lock(dst_inode);
>> +    if (endoff > dst_zi->i_max_size ||
>> +                    inode_newsize_ok(dst_inode, endoff)) {
>> +            inode_unlock(dst_inode);
>> +            goto err;
> 
> And here truncate mutex is not locked, but goto err will unlock it. This
> is broken...
> 
>> +    }
>> +    inode_unlock(dst_inode);
> 
> ...The locking is completely broken in this function anyway. You take the
> lock, look at something, then release the lock. Then what if a write or a
> trunctate comes in when the inode is not locked ? This is completely
> broken. The inode should be locked with no dio pending when this function
> is called. This is only to check if everything is ok. This has no business
> playing with the inode and truncate locks.
> 
>> +
>> +    return 0;
>> +err:
>> +    mutex_unlock(&dst_zi->i_truncate_mutex);
>> +    return -EINVAL;
>> +}
>> +
>> +static ssize_t zonefs_issue_copy(struct zonefs_inode_info *src_zi,
>> +            loff_t src_off, struct zonefs_inode_info *dst_zi,
>> +            loff_t dst_off, size_t len)
>> +{
>> +    struct block_device *src_bdev = src_zi->i_vnode.i_sb->s_bdev;
>> +    struct block_device *dst_bdev = dst_zi->i_vnode.i_sb->s_bdev;
>> +    struct range_entry *rlist = NULL;
>> +    int ret = len;
>> +
>> +    rlist = kmalloc(sizeof(*rlist), GFP_KERNEL);
> 
> GFP_NOIO ?
> 
>> +    if (!rlist)
>> +            return -ENOMEM;
>> +
>> +    rlist[0].dst = (dst_zi->i_zsector << SECTOR_SHIFT) + dst_off;
>> +    rlist[0].src = (src_zi->i_zsector << SECTOR_SHIFT) + src_off;
>> +    rlist[0].len = len;
>> +    rlist[0].comp_len = 0;
>> +    ret = blkdev_issue_copy(src_bdev, dst_bdev, rlist, 1, NULL, NULL,
>> +                    GFP_KERNEL);
>> +    if (rlist[0].comp_len > 0)
>> +            ret = rlist[0].comp_len;
>> +    kfree(rlist);
>> +
>> +    return ret;
>> +}
>> +
>> +/* Returns length of possible copy, else returns error */
>> +static ssize_t zonefs_copy_file_checks(struct file *src_file, loff_t 
>> src_off,
>> +                                    struct file *dst_file, loff_t dst_off,
>> +                                    size_t *len, unsigned int flags)
>> +{
>> +    struct inode *src_inode = file_inode(src_file);
>> +    struct inode *dst_inode = file_inode(dst_file);
>> +    struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
>> +    struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
>> +    ssize_t ret;
>> +
>> +    if (src_inode->i_sb != dst_inode->i_sb)
>> +            return -EXDEV;
>> +
>> +    /* Start by sync'ing the source and destination files for conv zones */
>> +    if (src_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
>> +            ret = file_write_and_wait_range(src_file, src_off,
>> +                            (src_off + *len));
>> +            if (ret < 0)
>> +                    goto io_error;
>> +    }
>> +    inode_dio_wait(src_inode);
> 
> That is not a "check". So having this in a function called
> zonefs_copy_file_checks() is a little strange.
> 
>> +
>> +    /* Start by sync'ing the source and destination files ifor conv zones */
> 
> Same comment repeated, with typos.
> 
>> +    if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
>> +            ret = file_write_and_wait_range(dst_file, dst_off,
>> +                            (dst_off + *len));
>> +            if (ret < 0)
>> +                    goto io_error;
>> +    }
>> +    inode_dio_wait(dst_inode);
>> +
>> +    /* Drop dst file cached pages for a conv zone*/
>> +    if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
>> +            ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
>> +                            dst_off >> PAGE_SHIFT,
>> +                            (dst_off + *len) >> PAGE_SHIFT);
>> +            if (ret < 0)
>> +                    goto io_error;
>> +    }
>> +
>> +    ret = zonefs_is_file_copy_offset_ok(src_inode, dst_inode, src_off,
>> +                    dst_off, len);
>> +    if (ret < 0)
> 
> if (ret)
> 
>> +            return ret;
>> +
>> +    return *len;
>> +
>> +io_error:
>> +    zonefs_io_error(dst_inode, true);
>> +    return ret;
>> +}
>> +
>> +static ssize_t zonefs_copy_file(struct file *src_file, loff_t src_off,
>> +            struct file *dst_file, loff_t dst_off,
>> +            size_t len, unsigned int flags)
>> +{
>> +    struct inode *src_inode = file_inode(src_file);
>> +    struct inode *dst_inode = file_inode(dst_file);
>> +    struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
>> +    struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
>> +    ssize_t ret = 0, bytes;
>> +
>> +    inode_lock(src_inode);
>> +    inode_lock(dst_inode);
> 
> So you did zonefs_copy_file_checks() outside of these locks, which mean
> that everything about the source and destination files may have changed.
> This does not work.


I forgot to mention that locking 2 inodes blindly like this can leads to
deadlocks if another process tries a copy range from dst to src at the
same time (lock order is reversed and so can deadlock).

> 
>> +    bytes = zonefs_issue_copy(src_zi, src_off, dst_zi, dst_off, len);
>> +    if (bytes < 0)
>> +            goto unlock_exit;
>> +
>> +    ret += bytes;
>> +
>> +    file_update_time(dst_file);
>> +    mutex_lock(&dst_zi->i_truncate_mutex);
>> +    zonefs_update_stats(dst_inode, dst_off + bytes);
>> +    zonefs_i_size_write(dst_inode, dst_off + bytes);
>> +    dst_zi->i_wpoffset += bytes;
> 
> This is wierd. iszie for dst will be dst_zi->i_wpoffset. So please do:
> 
>       dst_zi->i_wpoffset += bytes;
>       zonefs_i_size_write(dst_inode, dst_zi->i_wpoffset);
> 
>> +    mutex_unlock(&dst_zi->i_truncate_mutex);
> 
> And you are not taking care of the accounting for active zones here. If
> the copy made the dst zone full, it is not active anymore. You need to
> call zonefs_account_active();
> 
>> +    /* if we still have some bytes left, do splice copy */
>> +    if (bytes && (bytes < len)) {
>> +            bytes = do_splice_direct(src_file, &src_off, dst_file,
>> +                                     &dst_off, len, flags);
> 
> No way.
> 
>> +            if (bytes > 0)
>> +                    ret += bytes;
>> +    }
>> +unlock_exit:
>> +    if (ret < 0)
>> +            zonefs_io_error(dst_inode, true);
> 
> How can you be sure that you even did an IO when you get an error ?
> zonefs_issue_copy() may have failed on its kmalloc() and no IO was done.
> 
>> +    inode_unlock(src_inode);
>> +    inode_unlock(dst_inode);
>> +    return ret;
>> +}
>> +
>> +static ssize_t zonefs_copy_file_range(struct file *src_file, loff_t src_off,
>> +                                  struct file *dst_file, loff_t dst_off,
>> +                                  size_t len, unsigned int flags)
>> +{
>> +    ssize_t ret = -EIO;
> 
> This does not need to be initialized.
> 
>> +
>> +    ret = zonefs_copy_file_checks(src_file, src_off, dst_file, dst_off,
>> +                                 &len, flags);
> 
> These checks need to be done for the generic implementation too, no ? Why
> would checking this automatically trigger the offload ? What if the device
> does not support offloading ?
> 
>> +    if (ret > 0)
>> +            ret = zonefs_copy_file(src_file, src_off, dst_file, dst_off,
>> +                                 len, flags);
> 
> return here, then no need for the else. But see above. This seems all
> broken to me.
> 
>> +    else if (ret < 0 && ret == -EXDEV)
>> +            ret = generic_copy_file_range(src_file, src_off, dst_file,
>> +                                          dst_off, len, flags);
>> +    return ret;
>> +}
>> +
>>  static const struct file_operations zonefs_file_operations = {
>>      .open           = zonefs_file_open,
>>      .release        = zonefs_file_release,
>> @@ -1234,6 +1411,7 @@ static const struct file_operations 
>> zonefs_file_operations = {
>>      .splice_read    = generic_file_splice_read,
>>      .splice_write   = iter_file_splice_write,
>>      .iopoll         = iocb_bio_iopoll,
>> +    .copy_file_range = zonefs_copy_file_range,
>>  };
>>  
>>  static struct kmem_cache *zonefs_inode_cachep;
>> @@ -1804,6 +1982,7 @@ static int zonefs_fill_super(struct super_block *sb, 
>> void *data, int silent)
>>      atomic_set(&sbi->s_active_seq_files, 0);
>>      sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);
>>  
>> +    /* set copy support by default */
> 
> What is this comment supposed to be for ?
> 
>>      ret = zonefs_read_super(sb);
>>      if (ret)
>>              return ret;
> 

-- 
Damien Le Moal
Western Digital Research

--
dm-devel mailing list
[email protected]
https://listman.redhat.com/mailman/listinfo/dm-devel

Re: [dm-devel] [PATCH v5 10/10] fs: add support for copy file range in zonefs

Reply via email to