Add generic copy offload support to the block layer.

We add two new bio types: REQ_OP_COPY_READ_TOKEN and
REQ_OP_COPY_WRITE_TOKEN. Their bio vector has one entry - a page
containing the token.

When we need to copy data, we send REQ_OP_COPY_READ_TOKEN to the source
device and then we send REQ_OP_COPY_WRITE_TOKEN to the destination device.

This patch introduces a new ioctl, BLKCOPY, that submits the copy operation.
The BLKCOPY argument is an array of four 64-bit numbers: source offset,
destination offset, length, and a result field. The first three are given
in bytes and must be multiples of 512. The fourth is written by the ioctl
and holds the number of bytes that were actually copied.

For in-kernel users, we introduce a function blkdev_issue_copy.

Copying may fail at any time; the caller is required to fall back to an
explicit copy.

Signed-off-by: Mikulas Patocka <[email protected]>

---
 block/blk-core.c          |    7 +++
 block/blk-lib.c           |   89 ++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-settings.c      |   12 ++++++
 block/blk-sysfs.c         |    7 +++
 block/blk.h               |    3 +
 block/ioctl.c             |   56 ++++++++++++++++++++++++++++
 include/linux/blk_types.h |    4 ++
 include/linux/blkdev.h    |   18 +++++++++
 include/uapi/linux/fs.h   |    1 +
 9 files changed, 197 insertions(+)

Index: linux-2.6/block/blk-settings.c
===================================================================
--- linux-2.6.orig/block/blk-settings.c 2022-01-26 19:12:30.000000000 +0100
+++ linux-2.6/block/blk-settings.c      2022-01-27 20:43:27.000000000 +0100
@@ -57,6 +57,7 @@ void blk_set_default_limits(struct queue
        lim->misaligned = 0;
        lim->zoned = BLK_ZONED_NONE;
        lim->zone_write_granularity = 0;
+       lim->max_copy_sectors = 0;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
 
@@ -365,6 +366,17 @@ void blk_queue_zone_write_granularity(st
 EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity);
 
 /**
+ * blk_queue_max_copy_sectors - set maximum copy offload sectors for the queue
+ * @q:  the request queue for the device
+ * @size:  the maximum copy offload sectors
+ */
+void blk_queue_max_copy_sectors(struct request_queue *q, unsigned int size)
+{
+       q->limits.max_copy_sectors = size;
+}
+EXPORT_SYMBOL_GPL(blk_queue_max_copy_sectors);
+
+/**
  * blk_queue_alignment_offset - set physical block alignment offset
  * @q: the request queue for the device
  * @offset: alignment offset in bytes
Index: linux-2.6/include/linux/blkdev.h
===================================================================
--- linux-2.6.orig/include/linux/blkdev.h       2022-01-26 19:12:30.000000000 +0100
+++ linux-2.6/include/linux/blkdev.h    2022-01-29 17:46:03.000000000 +0100
@@ -103,6 +103,7 @@ struct queue_limits {
        unsigned int            discard_granularity;
        unsigned int            discard_alignment;
        unsigned int            zone_write_granularity;
+       unsigned int            max_copy_sectors;
 
        unsigned short          max_segments;
        unsigned short          max_integrity_segments;
@@ -706,6 +707,7 @@ extern void blk_queue_max_zone_append_se
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 void blk_queue_zone_write_granularity(struct request_queue *q,
                                      unsigned int size);
+void blk_queue_max_copy_sectors(struct request_queue *q, unsigned int size);
 extern void blk_queue_alignment_offset(struct request_queue *q,
                                       unsigned int alignment);
 void disk_update_readahead(struct gendisk *disk);
@@ -862,6 +864,10 @@ extern int __blkdev_issue_zeroout(struct
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, unsigned flags);
 
+extern int blkdev_issue_copy(struct block_device *bdev1, sector_t sector1,
+                     struct block_device *bdev2, sector_t sector2,
+                     sector_t nr_sects, sector_t *copied, gfp_t gfp_mask);
+
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
                sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
@@ -1001,6 +1007,18 @@ bdev_zone_write_granularity(struct block
        return queue_zone_write_granularity(bdev_get_queue(bdev));
 }
 
+static inline unsigned int
+queue_max_copy_sectors(const struct request_queue *q)
+{
+       return q->limits.max_copy_sectors;
+}
+
+static inline unsigned int
+bdev_max_copy_sectors(struct block_device *bdev)
+{
+       return queue_max_copy_sectors(bdev_get_queue(bdev));
+}
+
 static inline int queue_alignment_offset(const struct request_queue *q)
 {
        if (q->limits.misaligned)
Index: linux-2.6/block/blk-sysfs.c
===================================================================
--- linux-2.6.orig/block/blk-sysfs.c    2022-01-26 19:12:30.000000000 +0100
+++ linux-2.6/block/blk-sysfs.c 2022-01-26 19:12:30.000000000 +0100
@@ -230,6 +230,11 @@ static ssize_t queue_zone_write_granular
        return queue_var_show(queue_zone_write_granularity(q), page);
 }
 
+static ssize_t queue_max_copy_sectors_show(struct request_queue *q, char *page)
+{
+       return queue_var_show(queue_max_copy_sectors(q), page);
+}
+
 static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
 {
        unsigned long long max_sectors = q->limits.max_zone_append_sectors;
@@ -591,6 +596,7 @@ QUEUE_RO_ENTRY(queue_write_same_max, "wr
 QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
+QUEUE_RO_ENTRY(queue_max_copy_sectors, "max_copy_sectors");
 
 QUEUE_RO_ENTRY(queue_zoned, "zoned");
 QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
@@ -647,6 +653,7 @@ static struct attribute *queue_attrs[] =
        &queue_write_zeroes_max_entry.attr,
        &queue_zone_append_max_entry.attr,
        &queue_zone_write_granularity_entry.attr,
+       &queue_max_copy_sectors_entry.attr,
        &queue_nonrot_entry.attr,
        &queue_zoned_entry.attr,
        &queue_nr_zones_entry.attr,
Index: linux-2.6/include/linux/blk_types.h
===================================================================
--- linux-2.6.orig/include/linux/blk_types.h    2022-01-06 18:55:01.000000000 +0100
+++ linux-2.6/include/linux/blk_types.h 2022-01-29 17:47:44.000000000 +0100
@@ -371,6 +371,10 @@ enum req_opf {
        /* reset all the zone present on the device */
        REQ_OP_ZONE_RESET_ALL   = 17,
 
+       /* copy offload bios */
+       REQ_OP_COPY_READ_TOKEN  = 18,
+       REQ_OP_COPY_WRITE_TOKEN = 19,
+
        /* Driver private requests */
        REQ_OP_DRV_IN           = 34,
        REQ_OP_DRV_OUT          = 35,
Index: linux-2.6/block/blk-lib.c
===================================================================
--- linux-2.6.orig/block/blk-lib.c      2021-08-18 13:59:55.000000000 +0200
+++ linux-2.6/block/blk-lib.c   2022-01-30 17:33:04.000000000 +0100
@@ -440,3 +440,92 @@ retry:
        return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
+
+static void bio_wake_completion(struct bio *bio)
+{
+       struct completion *comp = bio->bi_private;
+       complete(comp);
+}
+
+int blkdev_issue_copy(struct block_device *bdev1, sector_t sector1,
+                     struct block_device *bdev2, sector_t sector2,
+                     sector_t nr_sects, sector_t *copied, gfp_t gfp_mask)
+{
+       struct page *token;
+       sector_t m;
+       int r = 0;
+       struct completion comp;
+
+       *copied = 0;
+
+       m = min(bdev_max_copy_sectors(bdev1), bdev_max_copy_sectors(bdev2));
+       if (!m)
+               return -EOPNOTSUPP;
+       m = min(m, (sector_t)round_down(UINT_MAX, PAGE_SIZE) >> 9);
+
+       if (unlikely(bdev_read_only(bdev2)))
+               return -EPERM;
+
+       token = alloc_page(gfp_mask);
+       if (unlikely(!token))
+               return -ENOMEM;
+
+       while (nr_sects) {
+               struct bio *read_bio, *write_bio;
+               sector_t this_step = min(nr_sects, m);
+
+               read_bio = bio_alloc(gfp_mask, 1);
+               if (unlikely(!read_bio)) {
+                       r = -ENOMEM;
+                       break;
+               }
+               bio_set_op_attrs(read_bio, REQ_OP_COPY_READ_TOKEN, REQ_NOMERGE);
+               bio_set_dev(read_bio, bdev1);
+               __bio_add_page(read_bio, token, PAGE_SIZE, 0);
+               read_bio->bi_iter.bi_sector = sector1;
+               read_bio->bi_iter.bi_size = this_step << 9;
+               read_bio->bi_private = &comp;
+               read_bio->bi_end_io = bio_wake_completion;
+               init_completion(&comp);
+               submit_bio(read_bio);
+               wait_for_completion(&comp);
+               if (unlikely(read_bio->bi_status != BLK_STS_OK)) {
+                       r = blk_status_to_errno(read_bio->bi_status);
+                       bio_put(read_bio);
+                       break;
+               }
+               bio_put(read_bio);
+
+               write_bio = bio_alloc(gfp_mask, 1);
+               if (unlikely(!write_bio)) {
+                       r = -ENOMEM;
+                       break;
+               }
+               bio_set_op_attrs(write_bio, REQ_OP_COPY_WRITE_TOKEN, REQ_NOMERGE);
+               bio_set_dev(write_bio, bdev2);
+               __bio_add_page(write_bio, token, PAGE_SIZE, 0);
+               write_bio->bi_iter.bi_sector = sector2;
+               write_bio->bi_iter.bi_size = this_step << 9;
+               write_bio->bi_private = &comp;
+               write_bio->bi_end_io = bio_wake_completion;
+               reinit_completion(&comp);
+               submit_bio(write_bio);
+               wait_for_completion(&comp);
+               if (unlikely(write_bio->bi_status != BLK_STS_OK)) {
+                       r = blk_status_to_errno(write_bio->bi_status);
+                       bio_put(write_bio);
+                       break;
+               }
+               bio_put(write_bio);
+
+               sector1 += this_step;
+               sector2 += this_step;
+               nr_sects -= this_step;
+               *copied += this_step;
+       }
+
+       __free_page(token);
+
+       return r;
+}
+EXPORT_SYMBOL(blkdev_issue_copy);
Index: linux-2.6/block/ioctl.c
===================================================================
--- linux-2.6.orig/block/ioctl.c        2022-01-24 15:10:40.000000000 +0100
+++ linux-2.6/block/ioctl.c     2022-01-30 13:43:35.000000000 +0100
@@ -165,6 +165,60 @@ fail:
        return err;
 }
 
+static int blk_ioctl_copy(struct block_device *bdev, fmode_t mode,
+               unsigned long arg)
+{
+       uint64_t range[4];
+       uint64_t start1, start2, end1, end2, len;
+       sector_t copied = 0;
+       struct inode *inode = bdev->bd_inode;
+       int err;
+
+       if (!(mode & FMODE_WRITE)) {
+               err = -EBADF;
+               goto fail1;
+       }
+
+       if (copy_from_user(range, (void __user *)arg, 24)) {
+               err = -EFAULT;
+               goto fail1;
+       }
+
+       start1 = range[0];
+       start2 = range[1];
+       len = range[2];
+       end1 = start1 + len - 1;
+       end2 = start2 + len - 1;
+
+       if ((start1 | start2 | len) & 511)
+               return -EINVAL;
+       if (end1 >= (uint64_t)bdev_nr_bytes(bdev))
+               return -EINVAL;
+       if (end2 >= (uint64_t)bdev_nr_bytes(bdev))
+               return -EINVAL;
+       if (end1 < start1)
+               return -EINVAL;
+       if (end2 < start2)
+               return -EINVAL;
+
+       filemap_invalidate_lock(inode->i_mapping);
+       err = truncate_bdev_range(bdev, mode, start2, end2);
+       if (err)
+               goto fail2;
+
+       err = blkdev_issue_copy(bdev, start1 >> 9, bdev, start2 >> 9, len >> 9, &copied, GFP_KERNEL);
+
+fail2:
+       filemap_invalidate_unlock(inode->i_mapping);
+
+fail1:
+       range[3] = (uint64_t)copied << 9;
+       if (copy_to_user((void __user *)(arg + 24), &range[3], 8))
+               err = -EFAULT;
+
+       return err;
+}
+
 static int put_ushort(unsigned short __user *argp, unsigned short val)
 {
        return put_user(val, argp);
@@ -459,6 +513,8 @@ static int blkdev_common_ioctl(struct bl
                return blk_ioctl_zeroout(bdev, mode, arg);
        case BLKGETDISKSEQ:
                return put_u64(argp, bdev->bd_disk->diskseq);
+       case BLKCOPY:
+               return blk_ioctl_copy(bdev, mode, arg);
        case BLKREPORTZONE:
                return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
        case BLKRESETZONE:
Index: linux-2.6/include/uapi/linux/fs.h
===================================================================
--- linux-2.6.orig/include/uapi/linux/fs.h      2021-09-23 17:07:02.000000000 +0200
+++ linux-2.6/include/uapi/linux/fs.h   2022-01-27 19:05:46.000000000 +0100
@@ -185,6 +185,7 @@ struct fsxattr {
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
 #define BLKGETDISKSEQ _IOR(0x12,128,__u64)
+#define BLKCOPY _IO(0x12,129)
 /*
  * A jump here: 130-136 are reserved for zoned block devices
  * (see uapi/linux/blkzoned.h)
Index: linux-2.6/block/blk.h
===================================================================
--- linux-2.6.orig/block/blk.h  2022-01-24 15:10:40.000000000 +0100
+++ linux-2.6/block/blk.h       2022-01-29 18:10:28.000000000 +0100
@@ -288,6 +288,9 @@ static inline bool blk_may_split(struct
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_WRITE_SAME:
                return true; /* non-trivial splitting decisions */
+       case REQ_OP_COPY_READ_TOKEN:
+       case REQ_OP_COPY_WRITE_TOKEN:
+               return false;
        default:
                break;
        }
Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c     2022-01-24 15:10:40.000000000 +0100
+++ linux-2.6/block/blk-core.c  2022-02-01 15:53:39.000000000 +0100
@@ -124,6 +124,8 @@ static const char *const blk_op_name[] =
        REQ_OP_NAME(ZONE_APPEND),
        REQ_OP_NAME(WRITE_SAME),
        REQ_OP_NAME(WRITE_ZEROES),
+       REQ_OP_NAME(COPY_READ_TOKEN),
+       REQ_OP_NAME(COPY_WRITE_TOKEN),
        REQ_OP_NAME(DRV_IN),
        REQ_OP_NAME(DRV_OUT),
 };
@@ -758,6 +760,11 @@ noinline_for_stack bool submit_bio_check
                if (!q->limits.max_write_zeroes_sectors)
                        goto not_supported;
                break;
+       case REQ_OP_COPY_READ_TOKEN:
+       case REQ_OP_COPY_WRITE_TOKEN:
+               if (!q->limits.max_copy_sectors)
+                       goto not_supported;
+               break;
        default:
                break;
        }

--
dm-devel mailing list
[email protected]
https://listman.redhat.com/mailman/listinfo/dm-devel

Reply via email to