Re: [dm-devel] [PATCH v5 10/10] fs: add support for copy file range in zonefs

2022-11-30 Thread Damien Le Moal
On 11/30/22 13:17, Nitesh Shetty wrote:
> On Wed, Nov 30, 2022 at 08:45:55AM +0900, Damien Le Moal wrote:
>> On 11/29/22 21:22, Nitesh Shetty wrote:
>>> Acked. I do see a gap in current zonefs cfr implementation. I will drop this
>>
>> cfr ?
>>
> 
> yes, will drop zonefs cfr for next version.

I meant: I do not understand "cfr". I now realize that it probably means
copy-file-range ? Please be clear and do not use abbreviations.

-- 
Damien Le Moal
Western Digital Research




Re: [dm-devel] [PATCH v5 10/10] fs: add support for copy file range in zonefs

2022-11-30 Thread Nitesh Shetty
On Thu, Nov 24, 2022 at 10:47:55AM +0900, Damien Le Moal wrote:
> On 11/24/22 10:32, Damien Le Moal wrote:
> > On 11/23/22 14:58, Nitesh Shetty wrote:
> >> copy_file_range is implemented using copy offload,
> >> copy offloading to device is always enabled.
> >> To disable copy offloading mount with "no_copy_offload" mount option.
> > 
> > And where is the code that handles this option ?
> > 
> >> At present copy offload is only used, if the source and destination files
> >> are on same block device, otherwise copy file range is completed by
> >> generic copy file range.
> >>
> >> copy file range implemented as following:
> >>- write pending writes on the src and dest files
> >>- drop page cache for dest file if its conv zone
> >>- copy the range using offload
> >>- update dest file info
> >>
> >> For all failure cases we fallback to generic file copy range
> > 
> > For all cases ? That would be weird. What would be the point of trying to
> > copy again if e.g. the dest zone has gone offline or read only ?
> > 
> >> At present this implementation does not support conv aggregation
> > 
> > Please check this commit message overall: the grammar and punctuation
> > could really be improved.
> > 
> >>
> >> Signed-off-by: Nitesh Shetty 
> >> Signed-off-by: Anuj Gupta 
> >> ---
> >>  fs/zonefs/super.c | 179 ++
> >>  1 file changed, 179 insertions(+)
> >>
> >> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> >> index abc9a85106f2..15613433d4ae 100644
> >> --- a/fs/zonefs/super.c
> >> +++ b/fs/zonefs/super.c
> >> @@ -1223,6 +1223,183 @@ static int zonefs_file_release(struct inode 
> >> *inode, struct file *file)
> >>return 0;
> >>  }
> >>  
> >> +static int zonefs_is_file_copy_offset_ok(struct inode *src_inode,
> >> +  struct inode *dst_inode, loff_t src_off, loff_t dst_off,
> >> +  size_t *len)
> >> +{
> >> +  loff_t size, endoff;
> >> +  struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> >> +
> >> +  inode_lock(src_inode);
> >> +  size = i_size_read(src_inode);
> >> +  inode_unlock(src_inode);
> >> +  /* Don't copy beyond source file EOF. */
> >> +  if (src_off < size) {
> >> +  if (src_off + *len > size)
> >> +  *len = (size - (src_off + *len));
> >> +  } else
> >> +  *len = 0;
> > 
> > Missing curly brackets for the else.
> > 
> >> +
> >> +  mutex_lock(&dst_zi->i_truncate_mutex);
> >> +  if (dst_zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
> >> +  if (*len > dst_zi->i_max_size - dst_zi->i_wpoffset)
> >> +  *len -= dst_zi->i_max_size - dst_zi->i_wpoffset;
> >> +
> >> +  if (dst_off != dst_zi->i_wpoffset)
> >> +  goto err;
> >> +  }
> >> +  mutex_unlock(&dst_zi->i_truncate_mutex);
> >> +
> >> +  endoff = dst_off + *len;
> >> +  inode_lock(dst_inode);
> >> +  if (endoff > dst_zi->i_max_size ||
> >> +  inode_newsize_ok(dst_inode, endoff)) {
> >> +  inode_unlock(dst_inode);
> >> +  goto err;
> > 
> > And here the truncate mutex is not locked, but goto err will unlock it. This
> > is broken...
> > 
> >> +  }
> >> +  inode_unlock(dst_inode);
> > 
> > ...The locking is completely broken in this function anyway. You take the
> > lock, look at something, then release the lock. Then what if a write or a
> > truncate comes in when the inode is not locked ? This is completely
> > broken. The inode should be locked with no dio pending when this function
> > is called. This is only to check if everything is ok. This has no business
> > playing with the inode and truncate locks.
> > 
> >> +
> >> +  return 0;
> >> +err:
> >> +  mutex_unlock(&dst_zi->i_truncate_mutex);
> >> +  return -EINVAL;
> >> +}
> >> +
> >> +static ssize_t zonefs_issue_copy(struct zonefs_inode_info *src_zi,
> >> +  loff_t src_off, struct zonefs_inode_info *dst_zi,
> >> +  loff_t dst_off, size_t len)
> >> +{
> >> +  struct block_device *src_bdev = src_zi->i_vnode.i_sb->s_bdev;
> >> +  struct block_device *dst_bdev = dst_zi->i_vnode.i_sb->s_bdev;
> >> +  struct range_entry *rlist = NULL;
> >> +  int ret = len;
> >> +
> >> +  rlist = kmalloc(sizeof(*rlist), GFP_KERNEL);
> > 
> > GFP_NOIO ?
> > 
> >> +  if (!rlist)
> >> +  return -ENOMEM;
> >> +
> >> +  rlist[0].dst = (dst_zi->i_zsector << SECTOR_SHIFT) + dst_off;
> >> +  rlist[0].src = (src_zi->i_zsector << SECTOR_SHIFT) + src_off;
> >> +  rlist[0].len = len;
> >> +  rlist[0].comp_len = 0;
> >> +  ret = blkdev_issue_copy(src_bdev, dst_bdev, rlist, 1, NULL, NULL,
> >> +  GFP_KERNEL);
> >> +  if (rlist[0].comp_len > 0)
> >> +  ret = rlist[0].comp_len;
> >> +  kfree(rlist);
> >> +
> >> +  return ret;
> >> +}
> >> +
> >> +/* Returns length of possible copy, else returns error */
> >> +static ssize_t zonefs_copy_file_checks(struct file *src_file, loff_t 
> >> src_off,
> >> +  struct file *dst_file, loff_t dst_off,
> >> +  size_t 

Re: [dm-devel] [PATCH v5 10/10] fs: add support for copy file range in zonefs

2022-11-30 Thread Nitesh Shetty
On Wed, Nov 30, 2022 at 08:45:55AM +0900, Damien Le Moal wrote:
> On 11/29/22 21:22, Nitesh Shetty wrote:
> > Acked. I do see a gap in current zonefs cfr implementation. I will drop this
> 
> cfr ?
>

yes, will drop zonefs cfr for next version.

> > implementation for next version. Once we finalize on block copy offload
> > implementation, will re-pick this and send with above reviews fixed.
> > 
> > Thank you,
> > Nitesh
> 
> Please trim your replies.
> 

Noted

> 
> -- 
> Damien Le Moal
> Western Digital Research
> 
> 

Thanks,
Nitesh Shetty


Re: [dm-devel] [PATCH v5 10/10] fs: add support for copy file range in zonefs

2022-11-29 Thread Damien Le Moal
On 11/29/22 21:22, Nitesh Shetty wrote:
> Acked. I do see a gap in current zonefs cfr implementation. I will drop this

cfr ?

> implementation for next version. Once we finalize on block copy offload
> implementation, will re-pick this and send with above reviews fixed.
> 
> Thank you,
> Nitesh

Please trim your replies.


-- 
Damien Le Moal
Western Digital Research




Re: [dm-devel] [PATCH v5 10/10] fs: add support for copy file range in zonefs

2022-11-24 Thread Al Viro
On Thu, Nov 24, 2022 at 10:47:55AM +0900, Damien Le Moal wrote:
> >> +  inode_lock(src_inode);
> >> +  inode_lock(dst_inode);
> > 
> > So you did zonefs_copy_file_checks() outside of these locks, which means
> > that everything about the source and destination files may have changed.
> > This does not work.
> 
> I forgot to mention that locking 2 inodes blindly like this can lead to
> deadlocks if another process tries a copy range from dst to src at the
> same time (lock order is reversed and so can deadlock).

Not to mention the deadlocks with existing places where fs/namei.c locks
two inodes...
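
A minimal sketch of the fixed lock ordering being discussed here
(illustrative only, not part of the posted patch; the helper name is
made up). The in-tree helper lock_two_nondirectories() in fs/inode.c
applies the same pointer-ordering idea:

#include <linux/fs.h>

/*
 * Illustrative sketch: always take the two inode locks in the same
 * order, so that concurrent copies in opposite directions (src->dst
 * and dst->src) cannot deadlock against each other.
 */
static void copy_lock_two_inodes(struct inode *a, struct inode *b)
{
        if (a == b) {
                inode_lock(a);
                return;
        }
        if (a > b)
                swap(a, b);     /* lock the lower-addressed inode first */
        inode_lock(a);
        inode_lock_nested(b, I_MUTEX_NONDIR2);
}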




Re: [dm-devel] [PATCH v5 10/10] fs: add support for copy file range in zonefs

2022-11-23 Thread Damien Le Moal
On 11/24/22 10:32, Damien Le Moal wrote:
> On 11/23/22 14:58, Nitesh Shetty wrote:
>> copy_file_range is implemented using copy offload,
>> copy offloading to device is always enabled.
>> To disable copy offloading mount with "no_copy_offload" mount option.
> 
> And where is the code that handles this option ?
> 
>> At present copy offload is only used, if the source and destination files
>> are on same block device, otherwise copy file range is completed by
>> generic copy file range.
>>
>> copy file range implemented as following:
>>  - write pending writes on the src and dest files
>>  - drop page cache for dest file if its conv zone
>>  - copy the range using offload
>>  - update dest file info
>>
>> For all failure cases we fallback to generic file copy range
> 
> For all cases ? That would be weird. What would be the point of trying to
> copy again if e.g. the dest zone has gone offline or read only ?
> 
>> At present this implementation does not support conv aggregation
> 
> Please check this commit message overall: the grammar and punctuation
> could really be improved.
> 
>>
>> Signed-off-by: Nitesh Shetty 
>> Signed-off-by: Anuj Gupta 
>> ---
>>  fs/zonefs/super.c | 179 ++
>>  1 file changed, 179 insertions(+)
>>
>> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
>> index abc9a85106f2..15613433d4ae 100644
>> --- a/fs/zonefs/super.c
>> +++ b/fs/zonefs/super.c
>> @@ -1223,6 +1223,183 @@ static int zonefs_file_release(struct inode *inode, 
>> struct file *file)
>>  return 0;
>>  }
>>  
>> +static int zonefs_is_file_copy_offset_ok(struct inode *src_inode,
>> +struct inode *dst_inode, loff_t src_off, loff_t dst_off,
>> +size_t *len)
>> +{
>> +loff_t size, endoff;
>> +struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
>> +
>> +inode_lock(src_inode);
>> +size = i_size_read(src_inode);
>> +inode_unlock(src_inode);
>> +/* Don't copy beyond source file EOF. */
>> +if (src_off < size) {
>> +if (src_off + *len > size)
>> +*len = (size - (src_off + *len));
>> +} else
>> +*len = 0;
> 
> Missing curly brackets for the else.
> 
>> +
>> +mutex_lock(&dst_zi->i_truncate_mutex);
>> +if (dst_zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
>> +if (*len > dst_zi->i_max_size - dst_zi->i_wpoffset)
>> +*len -= dst_zi->i_max_size - dst_zi->i_wpoffset;
>> +
>> +if (dst_off != dst_zi->i_wpoffset)
>> +goto err;
>> +}
>> +mutex_unlock(&dst_zi->i_truncate_mutex);
>> +
>> +endoff = dst_off + *len;
>> +inode_lock(dst_inode);
>> +if (endoff > dst_zi->i_max_size ||
>> +inode_newsize_ok(dst_inode, endoff)) {
>> +inode_unlock(dst_inode);
>> +goto err;
> 
> And here the truncate mutex is not locked, but goto err will unlock it. This
> is broken...
> 
>> +}
>> +inode_unlock(dst_inode);
> 
> ...The locking is completely broken in this function anyway. You take the
> lock, look at something, then release the lock. Then what if a write or a
> truncate comes in when the inode is not locked ? This is completely
> broken. The inode should be locked with no dio pending when this function
> is called. This is only to check if everything is ok. This has no business
> playing with the inode and truncate locks.
> 
>> +
>> +return 0;
>> +err:
>> +mutex_unlock(&dst_zi->i_truncate_mutex);
>> +return -EINVAL;
>> +}
>> +
>> +static ssize_t zonefs_issue_copy(struct zonefs_inode_info *src_zi,
>> +loff_t src_off, struct zonefs_inode_info *dst_zi,
>> +loff_t dst_off, size_t len)
>> +{
>> +struct block_device *src_bdev = src_zi->i_vnode.i_sb->s_bdev;
>> +struct block_device *dst_bdev = dst_zi->i_vnode.i_sb->s_bdev;
>> +struct range_entry *rlist = NULL;
>> +int ret = len;
>> +
>> +rlist = kmalloc(sizeof(*rlist), GFP_KERNEL);
> 
> GFP_NOIO ?
> 
>> +if (!rlist)
>> +return -ENOMEM;
>> +
>> +rlist[0].dst = (dst_zi->i_zsector << SECTOR_SHIFT) + dst_off;
>> +rlist[0].src = (src_zi->i_zsector << SECTOR_SHIFT) + src_off;
>> +rlist[0].len = len;
>> +rlist[0].comp_len = 0;
>> +ret = blkdev_issue_copy(src_bdev, dst_bdev, rlist, 1, NULL, NULL,
>> +GFP_KERNEL);
>> +if (rlist[0].comp_len > 0)
>> +ret = rlist[0].comp_len;
>> +kfree(rlist);
>> +
>> +return ret;
>> +}
>> +
>> +/* Returns length of possible copy, else returns error */
>> +static ssize_t zonefs_copy_file_checks(struct file *src_file, loff_t 
>> src_off,
>> +struct file *dst_file, loff_t dst_off,
>> +size_t *len, unsigned int flags)
>> +{
>> +struct inode *src_inode = file_inode(src_file);
>> +struct inode *dst_inode = file_inode(dst_file);
>> +struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
>> +
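
The "GFP_NOIO ?" remark above concerns an allocation made on the I/O
submission path: with GFP_KERNEL, direct reclaim triggered by the
allocation may itself issue I/O, which is exactly what this path is in
the middle of setting up. A minimal sketch of the suggested change
(illustrative, not the posted patch):

        /*
         * Allocate the copy range descriptor without letting reclaim
         * issue new I/O from this submission path.
         */
        rlist = kmalloc(sizeof(*rlist), GFP_NOIO);
        if (!rlist)
                return -ENOMEM;

An alternative with the same effect is to bracket the section with
memalloc_noio_save()/memalloc_noio_restore(), which makes every
allocation inside the region behave as GFP_NOIO.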

Re: [dm-devel] [PATCH v5 10/10] fs: add support for copy file range in zonefs

2022-11-23 Thread Damien Le Moal
On 11/23/22 14:58, Nitesh Shetty wrote:
> copy_file_range is implemented using copy offload,
> copy offloading to device is always enabled.
> To disable copy offloading mount with "no_copy_offload" mount option.

And where is the code that handles this option ?

> At present copy offload is only used, if the source and destination files
> are on same block device, otherwise copy file range is completed by
> generic copy file range.
> 
> copy file range implemented as following:
>   - write pending writes on the src and dest files
>   - drop page cache for dest file if its conv zone
>   - copy the range using offload
>   - update dest file info
> 
> For all failure cases we fallback to generic file copy range

For all cases ? That would be weird. What would be the point of trying to
copy again if e.g. the dest zone has gone offline or read only ?

> At present this implementation does not support conv aggregation

Please check this commit message overall: the grammar and punctuation
could really be improved.

> 
> Signed-off-by: Nitesh Shetty 
> Signed-off-by: Anuj Gupta 
> ---
>  fs/zonefs/super.c | 179 ++
>  1 file changed, 179 insertions(+)
> 
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index abc9a85106f2..15613433d4ae 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -1223,6 +1223,183 @@ static int zonefs_file_release(struct inode *inode, 
> struct file *file)
>   return 0;
>  }
>  
> +static int zonefs_is_file_copy_offset_ok(struct inode *src_inode,
> + struct inode *dst_inode, loff_t src_off, loff_t dst_off,
> + size_t *len)
> +{
> + loff_t size, endoff;
> + struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> +
> + inode_lock(src_inode);
> + size = i_size_read(src_inode);
> + inode_unlock(src_inode);
> + /* Don't copy beyond source file EOF. */
> + if (src_off < size) {
> + if (src_off + *len > size)
> + *len = (size - (src_off + *len));
> + } else
> + *len = 0;

Missing curly brackets for the else.

> +
> + mutex_lock(&dst_zi->i_truncate_mutex);
> + if (dst_zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
> + if (*len > dst_zi->i_max_size - dst_zi->i_wpoffset)
> + *len -= dst_zi->i_max_size - dst_zi->i_wpoffset;
> +
> + if (dst_off != dst_zi->i_wpoffset)
> + goto err;
> + }
> + mutex_unlock(&dst_zi->i_truncate_mutex);
> +
> + endoff = dst_off + *len;
> + inode_lock(dst_inode);
> + if (endoff > dst_zi->i_max_size ||
> + inode_newsize_ok(dst_inode, endoff)) {
> + inode_unlock(dst_inode);
> + goto err;

And here the truncate mutex is not locked, but goto err will unlock it. This
is broken...

> + }
> + inode_unlock(dst_inode);

...The locking is completely broken in this function anyway. You take the
lock, look at something, then release the lock. Then what if a write or a
truncate comes in when the inode is not locked ? This is completely
broken. The inode should be locked with no dio pending when this function
is called. This is only to check if everything is ok. This has no business
playing with the inode and truncate locks.

> +
> + return 0;
> +err:
> + mutex_unlock(&dst_zi->i_truncate_mutex);
> + return -EINVAL;
> +}
> +
> +static ssize_t zonefs_issue_copy(struct zonefs_inode_info *src_zi,
> + loff_t src_off, struct zonefs_inode_info *dst_zi,
> + loff_t dst_off, size_t len)
> +{
> + struct block_device *src_bdev = src_zi->i_vnode.i_sb->s_bdev;
> + struct block_device *dst_bdev = dst_zi->i_vnode.i_sb->s_bdev;
> + struct range_entry *rlist = NULL;
> + int ret = len;
> +
> + rlist = kmalloc(sizeof(*rlist), GFP_KERNEL);

GFP_NOIO ?

> + if (!rlist)
> + return -ENOMEM;
> +
> + rlist[0].dst = (dst_zi->i_zsector << SECTOR_SHIFT) + dst_off;
> + rlist[0].src = (src_zi->i_zsector << SECTOR_SHIFT) + src_off;
> + rlist[0].len = len;
> + rlist[0].comp_len = 0;
> + ret = blkdev_issue_copy(src_bdev, dst_bdev, rlist, 1, NULL, NULL,
> + GFP_KERNEL);
> + if (rlist[0].comp_len > 0)
> + ret = rlist[0].comp_len;
> + kfree(rlist);
> +
> + return ret;
> +}
> +
> +/* Returns length of possible copy, else returns error */
> +static ssize_t zonefs_copy_file_checks(struct file *src_file, loff_t src_off,
> + struct file *dst_file, loff_t dst_off,
> + size_t *len, unsigned int flags)
> +{
> + struct inode *src_inode = file_inode(src_file);
> + struct inode *dst_inode = file_inode(dst_file);
> + struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
> + struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> + ssize_t ret;
> +
> + if (src_inode->i_sb != dst_inode->i_sb)
> + return 
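
To make the locking comments above concrete, a minimal sketch of the
check-only structure being asked for (illustrative and simplified, not
the posted patch: the helper name is made up, and callers are assumed
to already hold both inode locks and the destination i_truncate_mutex,
and to have drained direct I/O with inode_dio_wait() beforehand):

/*
 * Illustrative sketch: pure validation, no locking or unlocking here.
 * All locking is the caller's responsibility.
 */
static int zonefs_check_copy_range(struct inode *src_inode,
                                   struct inode *dst_inode,
                                   loff_t src_off, loff_t dst_off,
                                   size_t *len)
{
        struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
        loff_t size = i_size_read(src_inode);

        /* Never copy beyond the source EOF. */
        if (src_off >= size)
                return -EINVAL;
        *len = min_t(size_t, *len, size - src_off);

        /* A sequential zone only accepts writes at its write pointer. */
        if (dst_zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
            dst_off != dst_zi->i_wpoffset)
                return -EINVAL;

        /* The copy must fit within the destination zone. */
        if (dst_off + *len > dst_zi->i_max_size)
                return -EINVAL;

        return inode_newsize_ok(dst_inode, dst_off + *len);
}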

Re: [dm-devel] [PATCH v5 10/10] fs: add support for copy file range in zonefs

2022-11-23 Thread Nitesh Shetty
On Wed, Nov 23, 2022 at 08:53:14AM +0200, Amir Goldstein wrote:
> On Wed, Nov 23, 2022 at 8:26 AM Nitesh Shetty  wrote:
> >
> > copy_file_range is implemented using copy offload,
> > copy offloading to device is always enabled.
> > To disable copy offloading mount with "no_copy_offload" mount option.
> > At present copy offload is only used, if the source and destination files
> > are on same block device, otherwise copy file range is completed by
> > generic copy file range.
> >
> > copy file range implemented as following:
> > - write pending writes on the src and dest files
> > - drop page cache for dest file if its conv zone
> > - copy the range using offload
> > - update dest file info
> >
> > For all failure cases we fallback to generic file copy range
> > At present this implementation does not support conv aggregation
> >
> > Signed-off-by: Nitesh Shetty 
> > Signed-off-by: Anuj Gupta 
> > ---
> >  fs/zonefs/super.c | 179 ++
> >  1 file changed, 179 insertions(+)
> >
> > diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> > index abc9a85106f2..15613433d4ae 100644
> > --- a/fs/zonefs/super.c
> > +++ b/fs/zonefs/super.c
> > @@ -1223,6 +1223,183 @@ static int zonefs_file_release(struct inode *inode, 
> > struct file *file)
> > return 0;
> >  }
> >
> > +static int zonefs_is_file_copy_offset_ok(struct inode *src_inode,
> > +   struct inode *dst_inode, loff_t src_off, loff_t dst_off,
> > +   size_t *len)
> > +{
> > +   loff_t size, endoff;
> > +   struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> > +
> > +   inode_lock(src_inode);
> > +   size = i_size_read(src_inode);
> > +   inode_unlock(src_inode);
> > +   /* Don't copy beyond source file EOF. */
> > +   if (src_off < size) {
> > +   if (src_off + *len > size)
> > +   *len = (size - (src_off + *len));
> > +   } else
> > +   *len = 0;
> > +
> > +   mutex_lock(&dst_zi->i_truncate_mutex);
> > +   if (dst_zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
> > +   if (*len > dst_zi->i_max_size - dst_zi->i_wpoffset)
> > +   *len -= dst_zi->i_max_size - dst_zi->i_wpoffset;
> > +
> > +   if (dst_off != dst_zi->i_wpoffset)
> > +   goto err;
> > +   }
> > +   mutex_unlock(&dst_zi->i_truncate_mutex);
> > +
> > +   endoff = dst_off + *len;
> > +   inode_lock(dst_inode);
> > +   if (endoff > dst_zi->i_max_size ||
> > +   inode_newsize_ok(dst_inode, endoff)) {
> > +   inode_unlock(dst_inode);
> > +   goto err;
> > +   }
> > +   inode_unlock(dst_inode);
> > +
> > +   return 0;
> > +err:
> > +   mutex_unlock(&dst_zi->i_truncate_mutex);
> > +   return -EINVAL;
> > +}
> > +
> > +static ssize_t zonefs_issue_copy(struct zonefs_inode_info *src_zi,
> > +   loff_t src_off, struct zonefs_inode_info *dst_zi,
> > +   loff_t dst_off, size_t len)
> > +{
> > +   struct block_device *src_bdev = src_zi->i_vnode.i_sb->s_bdev;
> > +   struct block_device *dst_bdev = dst_zi->i_vnode.i_sb->s_bdev;
> > +   struct range_entry *rlist = NULL;
> > +   int ret = len;
> > +
> > +   rlist = kmalloc(sizeof(*rlist), GFP_KERNEL);
> > +   if (!rlist)
> > +   return -ENOMEM;
> > +
> > +   rlist[0].dst = (dst_zi->i_zsector << SECTOR_SHIFT) + dst_off;
> > +   rlist[0].src = (src_zi->i_zsector << SECTOR_SHIFT) + src_off;
> > +   rlist[0].len = len;
> > +   rlist[0].comp_len = 0;
> > +   ret = blkdev_issue_copy(src_bdev, dst_bdev, rlist, 1, NULL, NULL,
> > +   GFP_KERNEL);
> > +   if (rlist[0].comp_len > 0)
> > +   ret = rlist[0].comp_len;
> > +   kfree(rlist);
> > +
> > +   return ret;
> > +}
> > +
> > +/* Returns length of possible copy, else returns error */
> > +static ssize_t zonefs_copy_file_checks(struct file *src_file, loff_t 
> > src_off,
> > +   struct file *dst_file, loff_t 
> > dst_off,
> > +   size_t *len, unsigned int flags)
> > +{
> > +   struct inode *src_inode = file_inode(src_file);
> > +   struct inode *dst_inode = file_inode(dst_file);
> > +   struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
> > +   struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> > +   ssize_t ret;
> > +
> > +   if (src_inode->i_sb != dst_inode->i_sb)
> > +   return -EXDEV;
> > +
> > +   /* Start by sync'ing the source and destination files for conv 
> > zones */
> > +   if (src_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
> > +   ret = file_write_and_wait_range(src_file, src_off,
> > +   (src_off + *len));
> > +   if (ret < 0)
> > +   goto io_error;
> > +   }
> > +   

Re: [dm-devel] [PATCH v5 10/10] fs: add support for copy file range in zonefs

2022-11-23 Thread Amir Goldstein
On Wed, Nov 23, 2022 at 8:26 AM Nitesh Shetty  wrote:
>
> copy_file_range is implemented using copy offload,
> copy offloading to device is always enabled.
> To disable copy offloading mount with "no_copy_offload" mount option.
> At present copy offload is only used, if the source and destination files
> are on same block device, otherwise copy file range is completed by
> generic copy file range.
>
> copy file range implemented as following:
> - write pending writes on the src and dest files
> - drop page cache for dest file if its conv zone
> - copy the range using offload
> - update dest file info
>
> For all failure cases we fallback to generic file copy range
> At present this implementation does not support conv aggregation
>
> Signed-off-by: Nitesh Shetty 
> Signed-off-by: Anuj Gupta 
> ---
>  fs/zonefs/super.c | 179 ++
>  1 file changed, 179 insertions(+)
>
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index abc9a85106f2..15613433d4ae 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -1223,6 +1223,183 @@ static int zonefs_file_release(struct inode *inode, 
> struct file *file)
> return 0;
>  }
>
> +static int zonefs_is_file_copy_offset_ok(struct inode *src_inode,
> +   struct inode *dst_inode, loff_t src_off, loff_t dst_off,
> +   size_t *len)
> +{
> +   loff_t size, endoff;
> +   struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> +
> +   inode_lock(src_inode);
> +   size = i_size_read(src_inode);
> +   inode_unlock(src_inode);
> +   /* Don't copy beyond source file EOF. */
> +   if (src_off < size) {
> +   if (src_off + *len > size)
> +   *len = (size - (src_off + *len));
> +   } else
> +   *len = 0;
> +
> > +   mutex_lock(&dst_zi->i_truncate_mutex);
> +   if (dst_zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
> +   if (*len > dst_zi->i_max_size - dst_zi->i_wpoffset)
> +   *len -= dst_zi->i_max_size - dst_zi->i_wpoffset;
> +
> +   if (dst_off != dst_zi->i_wpoffset)
> +   goto err;
> +   }
> > +   mutex_unlock(&dst_zi->i_truncate_mutex);
> +
> +   endoff = dst_off + *len;
> +   inode_lock(dst_inode);
> +   if (endoff > dst_zi->i_max_size ||
> +   inode_newsize_ok(dst_inode, endoff)) {
> +   inode_unlock(dst_inode);
> +   goto err;
> +   }
> +   inode_unlock(dst_inode);
> +
> +   return 0;
> +err:
> > +   mutex_unlock(&dst_zi->i_truncate_mutex);
> +   return -EINVAL;
> +}
> +
> +static ssize_t zonefs_issue_copy(struct zonefs_inode_info *src_zi,
> +   loff_t src_off, struct zonefs_inode_info *dst_zi,
> +   loff_t dst_off, size_t len)
> +{
> +   struct block_device *src_bdev = src_zi->i_vnode.i_sb->s_bdev;
> +   struct block_device *dst_bdev = dst_zi->i_vnode.i_sb->s_bdev;
> +   struct range_entry *rlist = NULL;
> +   int ret = len;
> +
> +   rlist = kmalloc(sizeof(*rlist), GFP_KERNEL);
> +   if (!rlist)
> +   return -ENOMEM;
> +
> +   rlist[0].dst = (dst_zi->i_zsector << SECTOR_SHIFT) + dst_off;
> +   rlist[0].src = (src_zi->i_zsector << SECTOR_SHIFT) + src_off;
> +   rlist[0].len = len;
> +   rlist[0].comp_len = 0;
> +   ret = blkdev_issue_copy(src_bdev, dst_bdev, rlist, 1, NULL, NULL,
> +   GFP_KERNEL);
> +   if (rlist[0].comp_len > 0)
> +   ret = rlist[0].comp_len;
> +   kfree(rlist);
> +
> +   return ret;
> +}
> +
> +/* Returns length of possible copy, else returns error */
> +static ssize_t zonefs_copy_file_checks(struct file *src_file, loff_t src_off,
> +   struct file *dst_file, loff_t dst_off,
> +   size_t *len, unsigned int flags)
> +{
> +   struct inode *src_inode = file_inode(src_file);
> +   struct inode *dst_inode = file_inode(dst_file);
> +   struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
> +   struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> +   ssize_t ret;
> +
> +   if (src_inode->i_sb != dst_inode->i_sb)
> +   return -EXDEV;
> +
> +   /* Start by sync'ing the source and destination files for conv zones 
> */
> +   if (src_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
> +   ret = file_write_and_wait_range(src_file, src_off,
> +   (src_off + *len));
> +   if (ret < 0)
> +   goto io_error;
> +   }
> +   inode_dio_wait(src_inode);
> +
> +   /* Start by sync'ing the source and destination files ifor conv zones 
> */
> +   if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
> +   ret = file_write_and_wait_range(dst_file, dst_off,
> +   (dst_off + *len));
> +   if (ret < 0)
> +

[dm-devel] [PATCH v5 10/10] fs: add support for copy file range in zonefs

2022-11-23 Thread Nitesh Shetty
copy_file_range is implemented using copy offload,
copy offloading to device is always enabled.
To disable copy offloading mount with "no_copy_offload" mount option.
At present copy offload is only used, if the source and destination files
are on same block device, otherwise copy file range is completed by
generic copy file range.

copy file range implemented as following:
- write pending writes on the src and dest files
- drop page cache for dest file if its conv zone
- copy the range using offload
- update dest file info

For all failure cases we fallback to generic file copy range
At present this implementation does not support conv aggregation

Signed-off-by: Nitesh Shetty 
Signed-off-by: Anuj Gupta 
---
 fs/zonefs/super.c | 179 ++
 1 file changed, 179 insertions(+)

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index abc9a85106f2..15613433d4ae 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1223,6 +1223,183 @@ static int zonefs_file_release(struct inode *inode, 
struct file *file)
return 0;
 }
 
+static int zonefs_is_file_copy_offset_ok(struct inode *src_inode,
+   struct inode *dst_inode, loff_t src_off, loff_t dst_off,
+   size_t *len)
+{
+   loff_t size, endoff;
+   struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
+
+   inode_lock(src_inode);
+   size = i_size_read(src_inode);
+   inode_unlock(src_inode);
+   /* Don't copy beyond source file EOF. */
+   if (src_off < size) {
+   if (src_off + *len > size)
+   *len = (size - (src_off + *len));
+   } else
+   *len = 0;
+
+   mutex_lock(&dst_zi->i_truncate_mutex);
+   if (dst_zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
+   if (*len > dst_zi->i_max_size - dst_zi->i_wpoffset)
+   *len -= dst_zi->i_max_size - dst_zi->i_wpoffset;
+
+   if (dst_off != dst_zi->i_wpoffset)
+   goto err;
+   }
+   mutex_unlock(&dst_zi->i_truncate_mutex);
+
+   endoff = dst_off + *len;
+   inode_lock(dst_inode);
+   if (endoff > dst_zi->i_max_size ||
+   inode_newsize_ok(dst_inode, endoff)) {
+   inode_unlock(dst_inode);
+   goto err;
+   }
+   inode_unlock(dst_inode);
+
+   return 0;
+err:
+   mutex_unlock(&dst_zi->i_truncate_mutex);
+   return -EINVAL;
+}
+
+static ssize_t zonefs_issue_copy(struct zonefs_inode_info *src_zi,
+   loff_t src_off, struct zonefs_inode_info *dst_zi,
+   loff_t dst_off, size_t len)
+{
+   struct block_device *src_bdev = src_zi->i_vnode.i_sb->s_bdev;
+   struct block_device *dst_bdev = dst_zi->i_vnode.i_sb->s_bdev;
+   struct range_entry *rlist = NULL;
+   int ret = len;
+
+   rlist = kmalloc(sizeof(*rlist), GFP_KERNEL);
+   if (!rlist)
+   return -ENOMEM;
+
+   rlist[0].dst = (dst_zi->i_zsector << SECTOR_SHIFT) + dst_off;
+   rlist[0].src = (src_zi->i_zsector << SECTOR_SHIFT) + src_off;
+   rlist[0].len = len;
+   rlist[0].comp_len = 0;
+   ret = blkdev_issue_copy(src_bdev, dst_bdev, rlist, 1, NULL, NULL,
+   GFP_KERNEL);
+   if (rlist[0].comp_len > 0)
+   ret = rlist[0].comp_len;
+   kfree(rlist);
+
+   return ret;
+}
+
+/* Returns length of possible copy, else returns error */
+static ssize_t zonefs_copy_file_checks(struct file *src_file, loff_t src_off,
+   struct file *dst_file, loff_t dst_off,
+   size_t *len, unsigned int flags)
+{
+   struct inode *src_inode = file_inode(src_file);
+   struct inode *dst_inode = file_inode(dst_file);
+   struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
+   struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
+   ssize_t ret;
+
+   if (src_inode->i_sb != dst_inode->i_sb)
+   return -EXDEV;
+
+   /* Start by sync'ing the source and destination files for conv zones */
+   if (src_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
+   ret = file_write_and_wait_range(src_file, src_off,
+   (src_off + *len));
+   if (ret < 0)
+   goto io_error;
+   }
+   inode_dio_wait(src_inode);
+
+   /* Start by sync'ing the source and destination files ifor conv zones */
+   if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
+   ret = file_write_and_wait_range(dst_file, dst_off,
+   (dst_off + *len));
+   if (ret < 0)
+   goto io_error;
+   }
+   inode_dio_wait(dst_inode);
+
+   /* Drop dst file cached pages for a conv zone*/
+   if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
+   ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
+   dst_off >>