From: Nitesh Shetty <[email protected]>

Introduce REQ_OP_COPY, a no-merge copy offload operation. Create
bio with control information as payload and submit to the device.
Larger copy operation may be divided if necessary by looking at device
limits. REQ_OP_COPY(19) is a write op and takes zone_write_lock when
submitted to zoned device.
Native copy offload is not supported for stacked devices.

Signed-off-by: Nitesh Shetty <[email protected]>
Signed-off-by: SelvaKumar S <[email protected]>
---
 block/blk-core.c          |  84 ++++++++++++-
 block/blk-lib.c           | 252 ++++++++++++++++++++++++++++++++++++++
 block/blk-zoned.c         |   1 +
 block/bounce.c            |   1 +
 include/linux/bio.h       |   1 +
 include/linux/blk_types.h |  20 +++
 include/linux/blkdev.h    |  13 ++
 include/uapi/linux/fs.h   |  12 ++
 8 files changed, 378 insertions(+), 6 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d2722ecd4d9b..541b1561b4af 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -704,6 +704,17 @@ static noinline int should_fail_bio(struct bio *bio)
 }
 ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
 
+static inline int bio_check_copy_eod(struct bio *bio, sector_t start,
+               sector_t nr_sectors, sector_t max_sect)
+{
+       if (nr_sectors && max_sect &&
+           (nr_sectors > max_sect || start > max_sect - nr_sectors)) {
+               handle_bad_sector(bio, max_sect);
+               return -EIO;
+       }
+       return 0;
+}
+
 /*
  * Check whether this bio extends beyond the end of the device or partition.
  * This may well happen - the kernel calls bread() without checking the size of
@@ -723,6 +734,61 @@ static inline int bio_check_eod(struct bio *bio)
        return 0;
 }
 
+/*
+ * check for eod limits and remap ranges if needed
+ */
+static int blk_check_copy(struct bio *bio)
+{
+       struct blk_copy_payload *payload = bio_data(bio);
+       sector_t dst_max_sect, dst_start_sect, copy_size = 0;
+       sector_t src_max_sect, src_start_sect;
+       struct block_device *bd_part;
+       int i, ret = -EIO;
+
+       rcu_read_lock();
+
+       bd_part = bio->bi_bdev;
+       if (unlikely(!bd_part))
+               goto err;
+
+       dst_max_sect =  bdev_nr_sectors(bd_part);
+       dst_start_sect = bd_part->bd_start_sect;
+
+       src_max_sect = bdev_nr_sectors(payload->src_bdev);
+       src_start_sect = payload->src_bdev->bd_start_sect;
+
+       if (unlikely(should_fail_request(bd_part, bio->bi_iter.bi_size)))
+               goto err;
+
+       if (unlikely(bio_check_ro(bio)))
+               goto err;
+
+       rcu_read_unlock();
+
+       for (i = 0; i < payload->copy_nr_ranges; i++) {
+               ret = bio_check_copy_eod(bio, payload->range[i].src,
+                               payload->range[i].len, src_max_sect);
+               if (unlikely(ret))
+                       goto out;
+
+               payload->range[i].src += src_start_sect;
+               copy_size += payload->range[i].len;
+       }
+
+       /* check if copy length crosses eod */
+       ret = bio_check_copy_eod(bio, bio->bi_iter.bi_sector,
+                               copy_size, dst_max_sect);
+       if (unlikely(ret))
+               goto out;
+
+       bio->bi_iter.bi_sector += dst_start_sect;
+       return 0;
+err:
+       rcu_read_unlock();
+out:
+       return ret;
+}
+
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  */
@@ -799,13 +865,15 @@ static noinline_for_stack bool submit_bio_checks(struct 
bio *bio)
 
        if (should_fail_bio(bio))
                goto end_io;
-       if (unlikely(bio_check_ro(bio)))
-               goto end_io;
-       if (!bio_flagged(bio, BIO_REMAPPED)) {
-               if (unlikely(bio_check_eod(bio)))
-                       goto end_io;
-               if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
+       if (likely(!op_is_copy(bio->bi_opf))) {
+               if (unlikely(bio_check_ro(bio)))
                        goto end_io;
+               if (!bio_flagged(bio, BIO_REMAPPED)) {
+                       if (unlikely(bio_check_eod(bio)))
+                               goto end_io;
+                       if (bdev->bd_partno && 
unlikely(blk_partition_remap(bio)))
+                               goto end_io;
+               }
        }
 
        /*
@@ -829,6 +897,10 @@ static noinline_for_stack bool submit_bio_checks(struct 
bio *bio)
                if (!blk_queue_discard(q))
                        goto not_supported;
                break;
+       case REQ_OP_COPY:
+               if (unlikely(blk_check_copy(bio)))
+                       goto end_io;
+               break;
        case REQ_OP_SECURE_ERASE:
                if (!blk_queue_secure_erase(q))
                        goto not_supported;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 9f09beadcbe3..7fee0ae95c44 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -151,6 +151,258 @@ int blkdev_issue_discard(struct block_device *bdev, 
sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
+/*
+ * Wait on and process all in-flight BIOs.  This must only be called once
+ * all bios have been issued so that the refcount can only decrease.
+ * This just waits for all bios to make it through cio_bio_end_io.  IO
+ * errors are propagated through cio->io_error.
+ */
+static int cio_await_completion(struct cio *cio)
+{
+       int ret = 0;
+
+       while (atomic_read(&cio->refcount)) {
+               cio->waiter = current;
+               __set_current_state(TASK_UNINTERRUPTIBLE);
+               blk_io_schedule();
+               /* wake up sets us TASK_RUNNING */
+               cio->waiter = NULL;
+               ret = cio->io_err;
+       }
+       kvfree(cio);
+
+       return ret;
+}
+
+/*
+ * The BIO completion handler simply decrements refcount.
+ * Also wake up process, if this is the last bio to be completed.
+ *
+ * During I/O bi_private points at the cio.
+ */
+static void cio_bio_end_io(struct bio *bio)
+{
+       struct cio *cio = bio->bi_private;
+
+       if (bio->bi_status)
+               cio->io_err = bio->bi_status;
+       kvfree(page_address(bio_first_bvec_all(bio)->bv_page) +
+                       bio_first_bvec_all(bio)->bv_offset);
+       bio_put(bio);
+
+       if (atomic_dec_and_test(&cio->refcount) && cio->waiter)
+               wake_up_process(cio->waiter);
+}
+
+int blk_copy_offload_submit_bio(struct block_device *bdev,
+               struct blk_copy_payload *payload, int payload_size,
+               struct cio *cio, gfp_t gfp_mask)
+{
+       struct request_queue *q = bdev_get_queue(bdev);
+       struct bio *bio;
+
+       bio = bio_map_kern(q, payload, payload_size, gfp_mask);
+       if (IS_ERR(bio))
+               return PTR_ERR(bio);
+
+       bio_set_dev(bio, bdev);
+       bio->bi_opf = REQ_OP_COPY | REQ_NOMERGE;
+       bio->bi_iter.bi_sector = payload->dest;
+       bio->bi_end_io = cio_bio_end_io;
+       bio->bi_private = cio;
+       atomic_inc(&cio->refcount);
+       submit_bio(bio);
+
+       return 0;
+}
+
+/* Go through all the enrties inside user provided payload, and determine the
+ * maximum number of entries in a payload, based on device's scc-limits.
+ */
+static inline int blk_max_payload_entries(int nr_srcs, struct range_entry 
*rlist,
+               int max_nr_srcs, sector_t max_copy_range_sectors, sector_t 
max_copy_len)
+{
+       sector_t range_len, copy_len = 0, remaining = 0;
+       int ri = 0, pi = 1, max_pi = 0;
+
+       for (ri = 0; ri < nr_srcs; ri++) {
+               for (remaining = rlist[ri].len; remaining > 0; remaining -= 
range_len) {
+                       range_len = min3(remaining, max_copy_range_sectors,
+                                                               max_copy_len - 
copy_len);
+                       pi++;
+                       copy_len += range_len;
+
+                       if ((pi == max_nr_srcs) || (copy_len == max_copy_len)) {
+                               max_pi = max(max_pi, pi);
+                               pi = 1;
+                               copy_len = 0;
+                       }
+               }
+       }
+
+       return max(max_pi, pi);
+}
+
+/*
+ * blk_copy_offload_scc        - Use device's native copy offload feature
+ * Go through user provide payload, prepare new payload based on device's copy 
offload limits.
+ */
+int blk_copy_offload_scc(struct block_device *src_bdev, int nr_srcs,
+               struct range_entry *rlist, struct block_device *dest_bdev,
+               sector_t dest, gfp_t gfp_mask)
+{
+       struct request_queue *q = bdev_get_queue(dest_bdev);
+       struct cio *cio = NULL;
+       struct blk_copy_payload *payload;
+       sector_t range_len, copy_len = 0, remaining = 0;
+       sector_t src_blk, cdest = dest;
+       sector_t max_copy_range_sectors, max_copy_len;
+       int ri = 0, pi = 0, ret = 0, payload_size, max_pi, max_nr_srcs;
+
+       cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
+       if (!cio)
+               return -ENOMEM;
+       atomic_set(&cio->refcount, 0);
+
+       max_nr_srcs = q->limits.max_copy_nr_ranges;
+       max_copy_range_sectors = q->limits.max_copy_range_sectors;
+       max_copy_len = q->limits.max_copy_sectors;
+
+       max_pi = blk_max_payload_entries(nr_srcs, rlist, max_nr_srcs,
+                                       max_copy_range_sectors, max_copy_len);
+       payload_size = struct_size(payload, range, max_pi);
+
+       payload = kvmalloc(payload_size, gfp_mask);
+       if (!payload) {
+               ret = -ENOMEM;
+               goto free_cio;
+       }
+       payload->src_bdev = src_bdev;
+
+       for (ri = 0; ri < nr_srcs; ri++) {
+               for (remaining = rlist[ri].len, src_blk = rlist[ri].src; 
remaining > 0;
+                                               remaining -= range_len, src_blk 
+= range_len) {
+
+                       range_len = min3(remaining, max_copy_range_sectors,
+                                                               max_copy_len - 
copy_len);
+                       payload->range[pi].len = range_len;
+                       payload->range[pi].src = src_blk;
+                       pi++;
+                       copy_len += range_len;
+
+                       /* Submit current payload, if crossing device copy 
limits */
+                       if ((pi == max_nr_srcs) || (copy_len == max_copy_len)) {
+                               payload->dest = cdest;
+                               payload->copy_nr_ranges = pi;
+                               ret = blk_copy_offload_submit_bio(dest_bdev, 
payload,
+                                                               payload_size, 
cio, gfp_mask);
+                               if (ret)
+                                       goto free_payload;
+
+                               /* reset index, length and allocate new payload 
*/
+                               pi = 0;
+                               cdest += copy_len;
+                               copy_len = 0;
+                               payload = kvmalloc(payload_size, gfp_mask);
+                               if (!payload) {
+                                       ret = -ENOMEM;
+                                       goto free_cio;
+                               }
+                               payload->src_bdev = src_bdev;
+                       }
+               }
+       }
+
+       if (pi) {
+               payload->dest = cdest;
+               payload->copy_nr_ranges = pi;
+               ret = blk_copy_offload_submit_bio(dest_bdev, payload, 
payload_size, cio, gfp_mask);
+               if (ret)
+                       goto free_payload;
+       }
+
+       /* Wait for completion of all IO's*/
+       ret = cio_await_completion(cio);
+
+       return ret;
+
+free_payload:
+       kvfree(payload);
+free_cio:
+       cio_await_completion(cio);
+       return ret;
+}
+
+static inline sector_t blk_copy_len(struct range_entry *rlist, int nr_srcs)
+{
+       int i;
+       sector_t len = 0;
+
+       for (i = 0; i < nr_srcs; i++) {
+               if (rlist[i].len)
+                       len += rlist[i].len;
+               else
+                       return 0;
+       }
+
+       return len;
+}
+
+static inline bool blk_check_offload_scc(struct request_queue *src_q,
+               struct request_queue *dest_q)
+{
+       if (src_q == dest_q && src_q->limits.copy_offload == 
BLK_COPY_OFFLOAD_SCC)
+               return true;
+
+       return false;
+}
+
+/*
+ * blkdev_issue_copy - queue a copy
+ * @src_bdev:  source block device
+ * @nr_srcs:   number of source ranges to copy
+ * @src_rlist: array of source ranges
+ * @dest_bdev: destination block device
+ * @dest:      destination in sector
+ * @gfp_mask:   memory allocation flags (for bio_alloc)
+ * @flags:     BLKDEV_COPY_* flags to control behaviour
+ *
+ * Description:
+ *     Copy source ranges from source block device to destination block device.
+ *     length of a source range cannot be zero.
+ */
+int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
+               struct range_entry *src_rlist, struct block_device *dest_bdev,
+               sector_t dest, gfp_t gfp_mask, int flags)
+{
+       struct request_queue *src_q = bdev_get_queue(src_bdev);
+       struct request_queue *dest_q = bdev_get_queue(dest_bdev);
+       sector_t copy_len;
+       int ret = -EINVAL;
+
+       if (!src_q || !dest_q)
+               return -ENXIO;
+
+       if (!nr_srcs)
+               return -EINVAL;
+
+       if (nr_srcs >= MAX_COPY_NR_RANGE)
+               return -EINVAL;
+
+       copy_len = blk_copy_len(src_rlist, nr_srcs);
+       if (!copy_len && copy_len >= MAX_COPY_TOTAL_LENGTH)
+               return -EINVAL;
+
+       if (bdev_read_only(dest_bdev))
+               return -EPERM;
+
+       if (blk_check_offload_scc(src_q, dest_q))
+               ret = blk_copy_offload_scc(src_bdev, nr_srcs, src_rlist, 
dest_bdev, dest, gfp_mask);
+
+       return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_copy);
+
 /**
  * __blkdev_issue_write_same - generate number of bios with same page
  * @bdev:      target blockdev
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 86fce751bb17..7643fc868521 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -67,6 +67,7 @@ bool blk_req_needs_zone_write_lock(struct request *rq)
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_WRITE_SAME:
        case REQ_OP_WRITE:
+       case REQ_OP_COPY:
                return blk_rq_zone_is_seq(rq);
        default:
                return false;
diff --git a/block/bounce.c b/block/bounce.c
index 05fc7148489d..d9b05aaf6e56 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -176,6 +176,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
        bio->bi_iter.bi_size    = bio_src->bi_iter.bi_size;
 
        switch (bio_op(bio)) {
+       case REQ_OP_COPY:
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
        case REQ_OP_WRITE_ZEROES:
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 3d67d0fbc868..068fa2e8896a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -73,6 +73,7 @@ static inline bool bio_has_data(struct bio *bio)
 static inline bool bio_no_advance_iter(const struct bio *bio)
 {
        return bio_op(bio) == REQ_OP_DISCARD ||
+              bio_op(bio) == REQ_OP_COPY ||
               bio_op(bio) == REQ_OP_SECURE_ERASE ||
               bio_op(bio) == REQ_OP_WRITE_SAME ||
               bio_op(bio) == REQ_OP_WRITE_ZEROES;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 9e392daa1d7f..1ab77176cb46 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -347,6 +347,8 @@ enum req_opf {
        REQ_OP_ZONE_RESET       = 15,
        /* reset all the zone present on the device */
        REQ_OP_ZONE_RESET_ALL   = 17,
+       /* copy ranges within device */
+       REQ_OP_COPY             = 19,
 
        /* Driver private requests */
        REQ_OP_DRV_IN           = 34,
@@ -470,6 +472,11 @@ static inline bool op_is_discard(unsigned int op)
        return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
 }
 
+static inline bool op_is_copy(unsigned int op)
+{
+       return (op & REQ_OP_MASK) == REQ_OP_COPY;
+}
+
 /*
  * Check if a bio or request operation is a zone management operation, with
  * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
@@ -529,4 +536,17 @@ struct blk_rq_stat {
        u64 batch;
 };
 
+struct cio {
+       atomic_t refcount;
+       blk_status_t io_err;
+       struct task_struct *waiter;     /* waiting task (NULL if none) */
+};
+
+struct blk_copy_payload {
+       struct block_device     *src_bdev;
+       sector_t                dest;
+       int                     copy_nr_ranges;
+       struct range_entry      range[];
+};
+
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fd4cfaadda5b..38369dff6a36 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -52,6 +52,12 @@ struct blk_keyslot_manager;
 /* Doing classic polling */
 #define BLK_MQ_POLL_CLASSIC -1
 
+/* Define copy offload options */
+enum blk_copy {
+       BLK_COPY_OFFLOAD_EMULATE = 0,
+       BLK_COPY_OFFLOAD_SCC,
+};
+
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
@@ -1051,6 +1057,9 @@ static inline unsigned int 
blk_queue_get_max_sectors(struct request_queue *q,
                return min(q->limits.max_discard_sectors,
                           UINT_MAX >> SECTOR_SHIFT);
 
+       if (unlikely(op == REQ_OP_COPY))
+               return q->limits.max_copy_sectors;
+
        if (unlikely(op == REQ_OP_WRITE_SAME))
                return q->limits.max_write_same_sectors;
 
@@ -1326,6 +1335,10 @@ extern int __blkdev_issue_discard(struct block_device 
*bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, int flags,
                struct bio **biop);
 
+int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
+               struct range_entry *src_rlist, struct block_device *dest_bdev,
+               sector_t dest, gfp_t gfp_mask, int flags);
+
 #define BLKDEV_ZERO_NOUNMAP    (1 << 0)  /* do not free blocks */
 #define BLKDEV_ZERO_NOFALLBACK (1 << 1)  /* don't write explicit zeroes */
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index bdf7b404b3e7..7a97b588d892 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -64,6 +64,18 @@ struct fstrim_range {
        __u64 minlen;
 };
 
+/* Maximum no of entries supported */
+#define MAX_COPY_NR_RANGE      (1 << 12)
+
+/* maximum total copy length */
+#define MAX_COPY_TOTAL_LENGTH  (1 << 21)
+
+/* Source range entry for copy */
+struct range_entry {
+       __u64 src;
+       __u64 len;
+};
+
 /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
 #define FILE_DEDUPE_RANGE_SAME         0
 #define FILE_DEDUPE_RANGE_DIFFERS      1
-- 
2.25.1


--
dm-devel mailing list
[email protected]
https://listman.redhat.com/mailman/listinfo/dm-devel

Reply via email to