On Tue, Feb 08, 2022 at 04:21:19PM +0900, Damien Le Moal wrote:
> On 2/7/22 23:13, Nitesh Shetty wrote:
> > Introduce blkdev_issue_copy which supports source and destination bdevs,
> > and a array of (source, destination and copy length) tuples.
> 
> s/a/an
>

acked

> > Introduce REQ_COP copy offload operation flag. Create a read-write
> 
> REQ_COPY ?
>

acked

> > bio pair with a token as payload and submitted to the device in order.
> > the read request populates token with source specific information which
> > is then passed with write request.
> > Ths design is courtsey Mikulas Patocka<mpatocka@>'s token based copy
> 
> s/Ths design is courtsey/This design is courtesy of
>

acked

> > 
> > Larger copy operation may be divided if necessary by looking at device
> > limits.
> 
> may or will ?
> by looking at -> depending on the ?
> 

Larger copies will be divided based on the max_copy_sectors and
max_copy_range_sectors limits. Will add this in the next series.

> > 
> > Signed-off-by: Nitesh Shetty <[email protected]>
> > Signed-off-by: SelvaKumar S <[email protected]>
> > Signed-off-by: Arnav Dawn <[email protected]>
> > ---
> >  block/blk-lib.c           | 216 ++++++++++++++++++++++++++++++++++++++
> >  block/blk-settings.c      |   2 +
> >  block/blk.h               |   2 +
> >  include/linux/blk_types.h |  20 ++++
> >  include/linux/blkdev.h    |   3 +
> >  include/uapi/linux/fs.h   |  14 +++
> >  6 files changed, 257 insertions(+)
> > 
> > diff --git a/block/blk-lib.c b/block/blk-lib.c
> > index 1b8ced45e4e5..3ae2c27b566e 100644
> > --- a/block/blk-lib.c
> > +++ b/block/blk-lib.c
> > @@ -135,6 +135,222 @@ int blkdev_issue_discard(struct block_device *bdev, 
> > sector_t sector,
> >  }
> >  EXPORT_SYMBOL(blkdev_issue_discard);
> >  
> > +/*
> > + * Wait on and process all in-flight BIOs.  This must only be called once
> > + * all bios have been issued so that the refcount can only decrease.
> > + * This just waits for all bios to make it through bio_copy_end_io. IO
> > + * errors are propagated through cio->io_error.
> > + */
> > +static int cio_await_completion(struct cio *cio)
> > +{
> > +   int ret = 0;
> > +
> > +   while (atomic_read(&cio->refcount)) {
> > +           cio->waiter = current;
> > +           __set_current_state(TASK_UNINTERRUPTIBLE);
> > +           blk_io_schedule();
> > +           /* wake up sets us TASK_RUNNING */
> > +           cio->waiter = NULL;
> > +           ret = cio->io_err;
> 
> Why is this in the loop ?
>

agree.

> > +   }
> > +   kvfree(cio);
> > +
> > +   return ret;
> > +}
> > +
> > +static void bio_copy_end_io(struct bio *bio)
> > +{
> > +   struct copy_ctx *ctx = bio->bi_private;
> > +   struct cio *cio = ctx->cio;
> > +   sector_t clen;
> > +   int ri = ctx->range_idx;
> > +
> > +   if (bio->bi_status) {
> > +           cio->io_err = bio->bi_status;
> > +           clen = (bio->bi_iter.bi_sector - ctx->start_sec) << 
> > SECTOR_SHIFT;
> > +           cio->rlist[ri].comp_len = min_t(sector_t, clen, 
> > cio->rlist[ri].comp_len);
> > +   }
> > +   __free_page(bio->bi_io_vec[0].bv_page);
> > +   kfree(ctx);
> > +   bio_put(bio);
> > +
> > +   if (atomic_dec_and_test(&cio->refcount) && cio->waiter)
> > +           wake_up_process(cio->waiter);
> 
> This looks racy: the cio->waiter test and wakeup are not atomic.

Agreed. I will remove the atomic for the refcount and do the if check and the
wakeup under a lock in the next version.

> > +}
> > +
> > +/*
> > + * blk_copy_offload        - Use device's native copy offload feature
> > + * Go through user provide payload, prepare new payload based on device's 
> > copy offload limits.
> > + */
> > +int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
> > +           struct range_entry *rlist, struct block_device *dst_bdev, gfp_t 
> > gfp_mask)
> > +{
> > +   struct request_queue *sq = bdev_get_queue(src_bdev);
> > +   struct request_queue *dq = bdev_get_queue(dst_bdev);
> > +   struct bio *read_bio, *write_bio;
> > +   struct copy_ctx *ctx;
> > +   struct cio *cio;
> > +   struct page *token;
> > +   sector_t src_blk, copy_len, dst_blk;
> > +   sector_t remaining, max_copy_len = LONG_MAX;
> > +   int ri = 0, ret = 0;
> > +
> > +   cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
> > +   if (!cio)
> > +           return -ENOMEM;
> > +   atomic_set(&cio->refcount, 0);
> > +   cio->rlist = rlist;
> > +
> > +   max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_sectors,
> > +                   (sector_t)dq->limits.max_copy_sectors);
> 
> sq->limits.max_copy_sectors is already by definition smaller than
> LONG_MAX, so there is no need for the min3 here.
>

acked

> > +   max_copy_len = min3(max_copy_len, 
> > (sector_t)sq->limits.max_copy_range_sectors,
> > +                   (sector_t)dq->limits.max_copy_range_sectors) << 
> > SECTOR_SHIFT;> +
> > +   for (ri = 0; ri < nr_srcs; ri++) {
> > +           cio->rlist[ri].comp_len = rlist[ri].len;
> > +           for (remaining = rlist[ri].len, src_blk = rlist[ri].src, 
> > dst_blk = rlist[ri].dst;
> > +                   remaining > 0;
> > +                   remaining -= copy_len, src_blk += copy_len, dst_blk += 
> > copy_len) {
> 
> This is unreadable.
> 

Sure, I will simplify the loops in next version.

> > +                   copy_len = min(remaining, max_copy_len);
> > +
> > +                   token = alloc_page(gfp_mask);
> > +                   if (unlikely(!token)) {
> > +                           ret = -ENOMEM;
> > +                           goto err_token;
> > +                   }
> > +
> > +                   read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | 
> > REQ_COPY | REQ_NOMERGE,
> > +                                   gfp_mask);
> > +                   if (!read_bio) {
> > +                           ret = -ENOMEM;
> > +                           goto err_read_bio;
> > +                   }
> > +                   read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
> > +                   read_bio->bi_iter.bi_size = copy_len;
> > +                   __bio_add_page(read_bio, token, PAGE_SIZE, 0);
> > +                   ret = submit_bio_wait(read_bio);
> > +                   if (ret) {
> > +                           bio_put(read_bio);
> > +                           goto err_read_bio;
> > +                   }
> > +                   bio_put(read_bio);
> > +                   ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
> > +                   if (!ctx) {
> > +                           ret = -ENOMEM;
> > +                           goto err_read_bio;
> > +                   }
> 
> This should be done before the read.
>

acked.

> > +                   ctx->cio = cio;
> > +                   ctx->range_idx = ri;
> > +                   ctx->start_sec = rlist[ri].src;
> > +
> > +                   write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | 
> > REQ_COPY | REQ_NOMERGE,
> > +                                   gfp_mask);
> > +                   if (!write_bio) {
> > +                           ret = -ENOMEM;
> > +                           goto err_read_bio;
> > +                   }
> > +
> > +                   write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
> > +                   write_bio->bi_iter.bi_size = copy_len;
> > +                   __bio_add_page(write_bio, token, PAGE_SIZE, 0);
> > +                   write_bio->bi_end_io = bio_copy_end_io;
> > +                   write_bio->bi_private = ctx;
> > +                   atomic_inc(&cio->refcount);
> > +                   submit_bio(write_bio);
> > +           }
> > +   }
> > +
> > +   /* Wait for completion of all IO's*/
> > +   return cio_await_completion(cio);
> > +
> > +err_read_bio:
> > +   __free_page(token);
> > +err_token:
> > +   rlist[ri].comp_len = min_t(sector_t, rlist[ri].comp_len, (rlist[ri].len 
> > - remaining));
> > +
> > +   cio->io_err = ret;
> > +   return cio_await_completion(cio);
> > +}
> > +
> > +static inline int blk_copy_sanity_check(struct block_device *src_bdev,
> > +           struct block_device *dst_bdev, struct range_entry *rlist, int 
> > nr)
> > +{
> > +   unsigned int align_mask = max(
> > +                   bdev_logical_block_size(dst_bdev), 
> > bdev_logical_block_size(src_bdev)) - 1;
> > +   sector_t len = 0;
> > +   int i;
> > +
> > +   for (i = 0; i < nr; i++) {
> > +           if (rlist[i].len)
> > +                   len += rlist[i].len;
> > +           else
> > +                   return -EINVAL;
> > +           if ((rlist[i].dst & align_mask) || (rlist[i].src & align_mask) 
> > ||
> > +                           (rlist[i].len & align_mask))
> > +                   return -EINVAL;
> > +           rlist[i].comp_len = 0;
> > +   }
> > +
> > +   if (!len && len >= MAX_COPY_TOTAL_LENGTH)
> > +           return -EINVAL;
> > +
> > +   return 0;
> > +}
> > +
> > +static inline bool blk_check_copy_offload(struct request_queue *src_q,
> > +           struct request_queue *dest_q)
> > +{
> > +   if (dest_q->limits.copy_offload == BLK_COPY_OFFLOAD &&
> > +                   src_q->limits.copy_offload == BLK_COPY_OFFLOAD)
> > +           return true;
> > +
> > +   return false;
> > +}
> > +
> > +/*
> > + * blkdev_issue_copy - queue a copy
> > + * @src_bdev:      source block device
> > + * @nr_srcs:       number of source ranges to copy
> > + * @src_rlist:     array of source ranges
> > + * @dest_bdev:     destination block device
> > + * @gfp_mask:   memory allocation flags (for bio_alloc)
> > + * @flags: BLKDEV_COPY_* flags to control behaviour
> > + *
> > + * Description:
> > + * Copy source ranges from source block device to destination block device.
> > + * length of a source range cannot be zero.
> > + */
> > +int blkdev_issue_copy(struct block_device *src_bdev, int nr,
> > +           struct range_entry *rlist, struct block_device *dest_bdev,
> > +           gfp_t gfp_mask, int flags)
> > +{
> > +   struct request_queue *src_q = bdev_get_queue(src_bdev);
> > +   struct request_queue *dest_q = bdev_get_queue(dest_bdev);
> > +   int ret = -EINVAL;
> > +
> > +   if (!src_q || !dest_q)
> > +           return -ENXIO;
> > +
> > +   if (!nr)
> > +           return -EINVAL;
> > +
> > +   if (nr >= MAX_COPY_NR_RANGE)
> > +           return -EINVAL;
> > +
> > +   if (bdev_read_only(dest_bdev))
> > +           return -EPERM;
> > +
> > +   ret = blk_copy_sanity_check(src_bdev, dest_bdev, rlist, nr);
> > +   if (ret)
> > +           return ret;
> > +
> > +   if (blk_check_copy_offload(src_q, dest_q))
> > +           ret = blk_copy_offload(src_bdev, nr, rlist, dest_bdev, 
> > gfp_mask);
> > +
> > +   return ret;
> > +}
> > +EXPORT_SYMBOL(blkdev_issue_copy);
> > +
> >  /**
> >   * __blkdev_issue_write_same - generate number of bios with same page
> >   * @bdev:  target blockdev
> > diff --git a/block/blk-settings.c b/block/blk-settings.c
> > index 818454552cf8..4c8d48b8af25 100644
> > --- a/block/blk-settings.c
> > +++ b/block/blk-settings.c
> > @@ -545,6 +545,8 @@ int blk_stack_limits(struct queue_limits *t, struct 
> > queue_limits *b,
> >     t->max_segment_size = min_not_zero(t->max_segment_size,
> >                                        b->max_segment_size);
> >  
> > +   t->max_copy_sectors = min_not_zero(t->max_copy_sectors, 
> > b->max_copy_sectors);
> 
> Why min_not_zero ? If one of the underlying drive does not support copy
> offload, you cannot report that the top drive does.
>

agreed. Will update in next series.

> > +
> >     t->misaligned |= b->misaligned;
> >  
> >     alignment = queue_limit_alignment_offset(b, start);
> > diff --git a/block/blk.h b/block/blk.h
> > index abb663a2a147..94d2b055750b 100644
> > --- a/block/blk.h
> > +++ b/block/blk.h
> > @@ -292,6 +292,8 @@ static inline bool blk_may_split(struct request_queue 
> > *q, struct bio *bio)
> >             break;
> >     }
> >  
> > +   if (unlikely(op_is_copy(bio->bi_opf)))
> > +           return false;
> >     /*
> >      * All drivers must accept single-segments bios that are <= PAGE_SIZE.
> >      * This is a quick and dirty check that relies on the fact that
> > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> > index 5561e58d158a..0a3fee8ad61c 100644
> > --- a/include/linux/blk_types.h
> > +++ b/include/linux/blk_types.h
> > @@ -418,6 +418,7 @@ enum req_flag_bits {
> >     /* for driver use */
> >     __REQ_DRV,
> >     __REQ_SWAP,             /* swapping request. */
> > +   __REQ_COPY,             /* copy request*/
> >     __REQ_NR_BITS,          /* stops here */
> >  };
> >  
> > @@ -442,6 +443,7 @@ enum req_flag_bits {
> >  
> >  #define REQ_DRV                    (1ULL << __REQ_DRV)
> >  #define REQ_SWAP           (1ULL << __REQ_SWAP)
> > +#define REQ_COPY           (1ULL << __REQ_COPY)
> >  
> >  #define REQ_FAILFAST_MASK \
> >     (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
> > @@ -498,6 +500,11 @@ static inline bool op_is_discard(unsigned int op)
> >     return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
> >  }
> >  
> > +static inline bool op_is_copy(unsigned int op)
> > +{
> > +   return (op & REQ_COPY);
> > +}
> > +
> >  /*
> >   * Check if a bio or request operation is a zone management operation, with
> >   * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special 
> > case
> > @@ -532,4 +539,17 @@ struct blk_rq_stat {
> >     u64 batch;
> >  };
> >  
> > +struct cio {
> > +   atomic_t refcount;
> > +   blk_status_t io_err;
> > +   struct range_entry *rlist;
> > +   struct task_struct *waiter;     /* waiting task (NULL if none) */
> > +};
> > +
> > +struct copy_ctx {
> > +   int range_idx;
> > +   sector_t start_sec;
> > +   struct cio *cio;
> > +};
> > +
> >  #endif /* __LINUX_BLK_TYPES_H */
> > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > index f63ae50f1de3..15597488040c 100644
> > --- a/include/linux/blkdev.h
> > +++ b/include/linux/blkdev.h
> > @@ -1120,6 +1120,9 @@ extern int __blkdev_issue_discard(struct block_device 
> > *bdev, sector_t sector,
> >             struct bio **biop);
> >  struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int 
> > len,
> >             gfp_t gfp_mask);
> > +int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
> > +           struct range_entry *src_rlist, struct block_device *dest_bdev,
> > +           gfp_t gfp_mask, int flags);
> >  
> >  #define BLKDEV_ZERO_NOUNMAP        (1 << 0)  /* do not free blocks */
> >  #define BLKDEV_ZERO_NOFALLBACK     (1 << 1)  /* don't write explicit 
> > zeroes */
> > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > index bdf7b404b3e7..55bca8f6e8ed 100644
> > --- a/include/uapi/linux/fs.h
> > +++ b/include/uapi/linux/fs.h
> > @@ -64,6 +64,20 @@ struct fstrim_range {
> >     __u64 minlen;
> >  };
> >  
> > +/* Maximum no of entries supported */
> > +#define MAX_COPY_NR_RANGE  (1 << 12)
> > +
> > +/* maximum total copy length */
> > +#define MAX_COPY_TOTAL_LENGTH      (1 << 21)
> > +
> > +/* Source range entry for copy */
> > +struct range_entry {
> > +   __u64 src;
> > +   __u64 dst;
> > +   __u64 len;
> > +   __u64 comp_len;
> > +};
> > +
> >  /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl 
> > definitions */
> >  #define FILE_DEDUPE_RANGE_SAME             0
> >  #define FILE_DEDUPE_RANGE_DIFFERS  1
> 
> 
> -- 
> Damien Le Moal
> Western Digital Research
>

 -- 
Thank you
Nitesh
--
dm-devel mailing list
[email protected]
https://listman.redhat.com/mailman/listinfo/dm-devel

Reply via email to