The 02/15/2021 19:08, Mike Snitzer wrote:
> On Mon, Feb 15 2021 at  5:34am -0500,
> Sergei Shtepa <[email protected]> wrote:
> 
> > The 02/12/2021 19:06, Mike Snitzer wrote:
> > > On Fri, Feb 12 2021 at  6:34am -0500,
> > > Sergei Shtepa <[email protected]> wrote:
> > > 
> > > > The 02/11/2021 20:51, Mike Snitzer wrote:
> > > > > On Tue, Feb 09 2021 at  9:30am -0500,
> > > > > Sergei Shtepa <[email protected]> wrote:
> > > > > 
> > > > > > The 'noexcl' option allow to open underlying block-device
> > > > > > without FMODE_EXCL.
> > > > > > 
> > > > > > Signed-off-by: Sergei Shtepa <[email protected]>
> > > > > > ---
> > > > > >  drivers/md/dm-linear.c        | 14 +++++++++++++-
> > > > > >  drivers/md/dm-table.c         | 14 ++++++++------
> > > > > >  drivers/md/dm.c               | 26 +++++++++++++++++++-------
> > > > > >  drivers/md/dm.h               |  2 +-
> > > > > >  include/linux/device-mapper.h |  7 +++++++
> > > > > >  5 files changed, 48 insertions(+), 15 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
> > > > > > index 00774b5d7668..b16d89802b9d 100644
> > > > > > --- a/drivers/md/dm-linear.c
> > > > > > +++ b/drivers/md/dm-linear.c
> > > > > > @@ -33,7 +33,7 @@ static int linear_ctr(struct dm_target *ti, 
> > > > > > unsigned int argc, char **argv)
> > > > > >     char dummy;
> > > > > >     int ret;
> > > > > >  
> > > > > > -   if (argc != 2) {
> > > > > > +   if ((argc < 2) || (argc > 3)) {
> > > > > >             ti->error = "Invalid argument count";
> > > > > >             return -EINVAL;
> > > > > >     }
> > > > > > @@ -51,6 +51,18 @@ static int linear_ctr(struct dm_target *ti, 
> > > > > > unsigned int argc, char **argv)
> > > > > >     }
> > > > > >     lc->start = tmp;
> > > > > >  
> > > > > > +   ti->non_exclusive = false;
> > > > > > +   if (argc > 2) {
> > > > > > +           if (strcmp("noexcl", argv[2]) == 0)
> > > > > > +                   ti->non_exclusive = true;
> > > > > > +           else if (strcmp("excl", argv[2]) == 0)
> > > > > > +                   ti->non_exclusive = false;
> > > > > > +           else {
> > > > > > +                   ti->error = "Invalid exclusive option";
> > > > > > +                   return -EINVAL;
> > > > > > +           }
> > > > > > +   }
> > > > > > +
> > > > > >     ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), 
> > > > > > &lc->dev);
> > > > > >     if (ret) {
> > > > > >             ti->error = "Device lookup failed";
> > > > > > diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> > > > > > index 4acf2342f7ad..f020459465bd 100644
> > > > > > --- a/drivers/md/dm-table.c
> > > > > > +++ b/drivers/md/dm-table.c
> > > > > > @@ -322,7 +322,7 @@ static int device_area_is_invalid(struct 
> > > > > > dm_target *ti, struct dm_dev *dev,
> > > > > >   * device and not to touch the existing bdev field in case
> > > > > >   * it is accessed concurrently.
> > > > > >   */
> > > > > > -static int upgrade_mode(struct dm_dev_internal *dd, fmode_t 
> > > > > > new_mode,
> > > > > > +static int upgrade_mode(struct dm_dev_internal *dd, fmode_t 
> > > > > > new_mode, bool non_exclusive,
> > > > > >                     struct mapped_device *md)
> > > > > >  {
> > > > > >     int r;
> > > > > > @@ -330,8 +330,8 @@ static int upgrade_mode(struct dm_dev_internal 
> > > > > > *dd, fmode_t new_mode,
> > > > > >  
> > > > > >     old_dev = dd->dm_dev;
> > > > > >  
> > > > > > -   r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev,
> > > > > > -                           dd->dm_dev->mode | new_mode, &new_dev);
> > > > > > +   r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev, 
> > > > > > dd->dm_dev->mode | new_mode,
> > > > > > +                           non_exclusive, &new_dev);
> > > > > >     if (r)
> > > > > >             return r;
> > > > > >  
> > > > > > @@ -387,7 +387,8 @@ int dm_get_device(struct dm_target *ti, const 
> > > > > > char *path, fmode_t mode,
> > > > > >             if (!dd)
> > > > > >                     return -ENOMEM;
> > > > > >  
> > > > > > -           if ((r = dm_get_table_device(t->md, dev, mode, 
> > > > > > &dd->dm_dev))) {
> > > > > > +           r = dm_get_table_device(t->md, dev, mode, 
> > > > > > ti->non_exclusive, &dd->dm_dev);
> > > > > > +           if (r) {
> > > > > >                     kfree(dd);
> > > > > >                     return r;
> > > > > >             }
> > > > > > @@ -396,8 +397,9 @@ int dm_get_device(struct dm_target *ti, const 
> > > > > > char *path, fmode_t mode,
> > > > > >             list_add(&dd->list, &t->devices);
> > > > > >             goto out;
> > > > > >  
> > > > > > -   } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
> > > > > > -           r = upgrade_mode(dd, mode, t->md);
> > > > > > +   } else if ((dd->dm_dev->mode != (mode | dd->dm_dev->mode)) &&
> > > > > > +              (dd->dm_dev->non_exclusive != ti->non_exclusive)) {
> > > > > > +           r = upgrade_mode(dd, mode, ti->non_exclusive, t->md);
> > > > > >             if (r)
> > > > > >                     return r;
> > > > > >     }
> > > > > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> > > > > > index 00c41aa6d092..c25dcc2fdb89 100644
> > > > > > --- a/drivers/md/dm.c
> > > > > > +++ b/drivers/md/dm.c
> > > > > > @@ -1117,33 +1117,44 @@ static void close_table_device(struct 
> > > > > > table_device *td, struct mapped_device *md
> > > > > >     if (!td->dm_dev.bdev)
> > > > > >             return;
> > > > > >  
> > > > > > -   bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
> > > > > > -   blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
> > > > > > +   if (td->dm_dev.non_exclusive)
> > > > > > +           blkdev_put(td->dm_dev.bdev, td->dm_dev.mode);
> > > > > > +   else {
> > > > > > +           bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
> > > > > > +           blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | 
> > > > > > FMODE_EXCL);
> > > > > > +   }
> > > > > > +
> > > > > > +
> > > > > > +   blkdev_put(td->dm_dev.bdev, td->dm_dev.mode);
> > > > > > +
> > > > > >     put_dax(td->dm_dev.dax_dev);
> > > > > >     td->dm_dev.bdev = NULL;
> > > > > >     td->dm_dev.dax_dev = NULL;
> > > > > > +   td->dm_dev.non_exclusive = false;
> > > > > >  }
> > > > > >  
> > > > > >  static struct table_device *find_table_device(struct list_head *l, 
> > > > > > dev_t dev,
> > > > > > -                                         fmode_t mode)
> > > > > > +                                         fmode_t mode, bool 
> > > > > > non_exclusive)
> > > > > >  {
> > > > > >     struct table_device *td;
> > > > > >  
> > > > > >     list_for_each_entry(td, l, list)
> > > > > > -           if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode 
> > > > > > == mode)
> > > > > > +           if (td->dm_dev.bdev->bd_dev == dev &&
> > > > > > +               td->dm_dev.mode == mode &&
> > > > > > +               td->dm_dev.non_exclusive == non_exclusive)
> > > > > >                     return td;
> > > > > >  
> > > > > >     return NULL;
> > > > > >  }
> > > > > >  
> > > > > > -int dm_get_table_device(struct mapped_device *md, dev_t dev, 
> > > > > > fmode_t mode,
> > > > > > +int dm_get_table_device(struct mapped_device *md, dev_t dev, 
> > > > > > fmode_t mode, bool non_exclusive,
> > > > > >                     struct dm_dev **result)
> > > > > >  {
> > > > > >     int r;
> > > > > >     struct table_device *td;
> > > > > >  
> > > > > >     mutex_lock(&md->table_devices_lock);
> > > > > > -   td = find_table_device(&md->table_devices, dev, mode);
> > > > > > +   td = find_table_device(&md->table_devices, dev, mode, 
> > > > > > non_exclusive);
> > > > > >     if (!td) {
> > > > > >             td = kmalloc_node(sizeof(*td), GFP_KERNEL, 
> > > > > > md->numa_node_id);
> > > > > >             if (!td) {
> > > > > > @@ -1154,7 +1165,8 @@ int dm_get_table_device(struct mapped_device 
> > > > > > *md, dev_t dev, fmode_t mode,
> > > > > >             td->dm_dev.mode = mode;
> > > > > >             td->dm_dev.bdev = NULL;
> > > > > >  
> > > > > > -           if ((r = open_table_device(td, dev, md))) {
> > > > > > +           r = open_table_device(td, dev, md, non_exclusive);
> > > > > > +           if (r) {
> > > > > >                     mutex_unlock(&md->table_devices_lock);
> > > > > >                     kfree(td);
> > > > > >                     return r;
> > > > > > diff --git a/drivers/md/dm.h b/drivers/md/dm.h
> > > > > > index fffe1e289c53..7bf20fb2de74 100644
> > > > > > --- a/drivers/md/dm.h
> > > > > > +++ b/drivers/md/dm.h
> > > > > > @@ -179,7 +179,7 @@ int dm_open_count(struct mapped_device *md);
> > > > > >  int dm_lock_for_deletion(struct mapped_device *md, bool 
> > > > > > mark_deferred, bool only_deferred);
> > > > > >  int dm_cancel_deferred_remove(struct mapped_device *md);
> > > > > >  int dm_request_based(struct mapped_device *md);
> > > > > > -int dm_get_table_device(struct mapped_device *md, dev_t dev, 
> > > > > > fmode_t mode,
> > > > > > +int dm_get_table_device(struct mapped_device *md, dev_t dev, 
> > > > > > fmode_t mode, bool non_exclusive,
> > > > > >                     struct dm_dev **result);
> > > > > >  void dm_put_table_device(struct mapped_device *md, struct dm_dev 
> > > > > > *d);
> > > > > >  
> > > > > > diff --git a/include/linux/device-mapper.h 
> > > > > > b/include/linux/device-mapper.h
> > > > > > index 61a66fb8ebb3..70002363bfc0 100644
> > > > > > --- a/include/linux/device-mapper.h
> > > > > > +++ b/include/linux/device-mapper.h
> > > > > > @@ -150,6 +150,7 @@ struct dm_dev {
> > > > > >     struct block_device *bdev;
> > > > > >     struct dax_device *dax_dev;
> > > > > >     fmode_t mode;
> > > > > > +   bool non_exclusive;
> > > > > >     char name[16];
> > > > > >  };
> > > > > >  
> > > > > > @@ -325,6 +326,12 @@ struct dm_target {
> > > > > >      * whether or not its underlying devices have support.
> > > > > >      */
> > > > > >     bool discards_supported:1;
> > > > > > +
> > > > > > +   /*
> > > > > > +    * Set if this target needs to open device without FMODE_EXCL
> > > > > > +    * mode.
> > > > > > +    */
> > > > > > +   bool non_exclusive:1;
> > > > > >  };
> > > > > >  
> > > > > >  void *dm_per_bio_data(struct bio *bio, size_t data_size);
> > > > > > -- 
> > > > > > 2.20.1
> > > > > > 
> > > > > 
> > > > > I'm really not liking this tug-of-war about FMODE_EXCL vs not.
> > > > > Especially dislike the prospect of needing to change _every_ DM target
> > > > > that would be made to support blk_interposer.
> > > > > 
> > > > > I've said this before, private or otherwise, but: Hannes' approach 
> > > > > that
> > > > > fell back to opening without FMODE_EXCL if FMODE_EXCL open failed.  
> > > > > Have
> > > > > you explored that kind of approach?
> > > > 
> > > > Of course I explored that kind of approach. The easiest thing to do
> > > > is fell back to opening without FMODE_EXCL if FMODE_EXCL open failed.
> > > > 
> > > > But I remind you once again that in this case, without changing
> > > > the code of each target, we will change the behavior of each.
> > > > Any target will open the device without the FMODE_EXCL flag if the 
> > > > device
> > > > is already busy. This can cause errors and cause data loss.
> > > > I would not want the device mapper to get worse when adding new 
> > > > functionality.
> > > 
> > > Right, but I'm not talking about a blind fallback that strips FMODE_EXCL
> > > if FMODE_EXCL open failed.
> > >  
> > > > I will do so in the next patch, as you are sure that it is better... Or
> > > > I'll think about it again and try to suggest a better implementation.
> > > > 
> > > > Thank you, Mike.
> > > > 
> > > > > 
> > > > > You _should_ be able to infer that interposer is being used given the
> > > > > requirement to use an explicit remap ioctl to establish the use of
> > > > > interposer.
> > > 
> > > I'm suggesting that open_table_device and close_table_device be made
> > > aware of the fact that they are operating on behalf of your remap ioctl
> > > (interpose).  So store state in the mapped_device that reflects a remap
> > > was used.
> > > 
> > > Still clunky but at least it confines it to an implementation detail
> > > managed by DM core rather than imposing awkward interface changes in
> > > both DM core and the DM targets.
> > > 
> > > Mike
> > > 
> > 
> > Based on your requirements, I conclude that the knowledge about the use
> > of interposer should be passed when creating target, since this is where
> > the open_table_device function is called.
> > This means that the 'noexcl' parameter is no longer needed, but will be
> > replaced with 'interposer'.
> > The ioctl is no longer needed, the target already knows that it works
> > through the interposer, and we can attach it already when creating
> > the target.
> > 
> > I like this logic, and I will implement it.
> 
> Yes, I never understood why a new ioctl was introduced.  But please be
> aware that this should _not_ be implemented in terms of each DM target
> needing to handle 'interposer' being passed as a text arg to the .ctr().
> 
> It should be an optional DM_INTERPOSER_FLAG added to DM_DEV_CREATE_CMD
> (much like optional DM_NOFLUSH_FLAG can be used with DM_DEV_SUSPEND_CMD).
> 
> Mike
> 

Hello, Mike.

I tried to do as you specified and I think I succeeded.
Before sending the next version of the patch, I would like to suggest
that you see if I am moving in the right direction.

I _attach_ the patches for the DM to the email.

The first one is nothing particularly new, I just put the code for
blk_interposer in the dm-interposer.c/.h file.

In the second one, I implemented the DM_INTERPOSED_DEV_FLAG flag.
No one of the targets were changed.

I haven't run this code yet. To do this, I need to further refine
dmsetup. This is in progress. But I think the code looks good.

-- 
Sergei Shtepa
Veeam Software developer.
>From f23ea4e073e263b6de2a0c08e7df965c26f1b3db Mon Sep 17 00:00:00 2001
From: Sergei Shtepa <[email protected]>
Date: Tue, 23 Feb 2021 13:39:26 +0100
Subject: [PATCH 4/5] dm: introduce dm-interposer

dm-interposer.c/. h contains code for working with blk_interposer
and provides an API for interposer in device-mapper.

Signed-off-by: Sergei Shtepa <[email protected]>
---
 drivers/md/Makefile        |   2 +-
 drivers/md/dm-interposer.c | 250 +++++++++++++++++++++++++++++++++++++
 drivers/md/dm-interposer.h |  40 ++++++
 3 files changed, 291 insertions(+), 1 deletion(-)
 create mode 100644 drivers/md/dm-interposer.c
 create mode 100644 drivers/md/dm-interposer.h

diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index ef7ddc27685c..bd5b38bee82e 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -5,7 +5,7 @@
 
 dm-mod-y       += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
                   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o \
-                  dm-rq.o
+                  dm-rq.o dm-interposer.o
 dm-multipath-y += dm-path-selector.o dm-mpath.o
 dm-historical-service-time-y += dm-ps-historical-service-time.o
 dm-io-affinity-y += dm-ps-io-affinity.o
diff --git a/drivers/md/dm-interposer.c b/drivers/md/dm-interposer.c
new file mode 100644
index 000000000000..f4413f0e1f82
--- /dev/null
+++ b/drivers/md/dm-interposer.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bio.h>
+#include <linux/rwsem.h>
+#include <linux/refcount.h>
+#include <linux/device-mapper.h>
+#include <linux/interval_tree_generic.h>
+
+#include "dm-core.h"
+#include "dm-interposer.h"
+
+#define DM_MSG_PREFIX "interposer"
+
+struct dm_interposer {
+       struct blk_interposer blk_ip;
+
+       struct kref kref;
+       struct rw_semaphore ip_devs_lock;
+       struct rb_root_cached ip_devs_root; /* dm_interposed_dev tree */
+};
+
+/*
+ * Interval tree for device mapper
+ */
+#define START(node) ((node)->start)
+#define LAST(node) ((node)->last)
+INTERVAL_TREE_DEFINE(struct dm_rb_range, node, sector_t, _subtree_last,
+                    START, LAST,, dm_rb);
+
+static DEFINE_MUTEX(dm_interposer_attach_lock);
+
+static void dm_submit_bio_interposer_fn(struct bio *bio)
+{
+       struct dm_interposer *ip;
+       unsigned int noio_flag = 0;
+       sector_t start;
+       sector_t last;
+       struct dm_rb_range *node;
+
+       ip = container_of(bio->bi_disk->interposer, struct dm_interposer, 
blk_ip);
+       start = bio->bi_iter.bi_sector;
+       last = start + dm_sector_div_up(bio->bi_iter.bi_size, SECTOR_SIZE);
+
+       noio_flag = memalloc_noio_save();
+       down_read(&ip->ip_devs_lock);
+       node = dm_rb_iter_first(&ip->ip_devs_root, start, last);
+       while (node) {
+               struct dm_interposed_dev *ip_dev =
+                       container_of(node, struct dm_interposed_dev, node);
+
+               atomic64_inc(&ip_dev->ip_cnt);
+               ip_dev->dm_interpose_bio(ip_dev->context, node->start, bio);
+
+               node = dm_rb_iter_next(node, start, last);
+       }
+       up_read(&ip->ip_devs_lock);
+       memalloc_noio_restore(noio_flag);
+}
+
+void dm_interposer_free(struct kref *kref)
+{
+       struct dm_interposer *ip = container_of(kref, struct dm_interposer, 
kref);
+
+       blk_interposer_detach(&ip->blk_ip, dm_submit_bio_interposer_fn);
+
+       kfree(ip);
+}
+
+struct dm_interposer *dm_interposer_new(struct gendisk *disk)
+{
+       int ret = 0;
+       struct dm_interposer *ip;
+
+       ip = kzalloc(sizeof(struct dm_interposer), GFP_NOIO);
+       if (!ip)
+               return ERR_PTR(-ENOMEM);
+
+       kref_init(&ip->kref);
+       init_rwsem(&ip->ip_devs_lock);
+       ip->ip_devs_root = RB_ROOT_CACHED;
+
+       ret = blk_interposer_attach(disk, &ip->blk_ip, 
dm_submit_bio_interposer_fn);
+       if (ret) {
+               DMERR("Failed to attack blk_interposer.");
+               kref_put(&ip->kref, dm_interposer_free);
+               return ERR_PTR(ret);
+       }
+
+       return ip;
+}
+
+static struct dm_interposer *dm_interposer_get(struct gendisk *disk)
+{
+       struct dm_interposer *ip;
+
+       if (!blk_has_interposer(disk))
+               return NULL;
+
+       if (disk->interposer->ip_submit_bio != dm_submit_bio_interposer_fn) {
+               DMERR("Disks interposer slot already occupied.");
+               return ERR_PTR(-EBUSY);
+       }
+
+       ip = container_of(disk->interposer, struct dm_interposer, blk_ip);
+
+       kref_get(&ip->kref);
+       return ip;
+}
+
+static inline void dm_disk_freeze(struct gendisk *disk)
+{
+       blk_mq_freeze_queue(disk->queue);
+       blk_mq_quiesce_queue(disk->queue);
+}
+
+static inline void dm_disk_unfreeze(struct gendisk *disk)
+{
+       blk_mq_unquiesce_queue(disk->queue);
+       blk_mq_unfreeze_queue(disk->queue);
+}
+
+/**
+ * dm_interposer_dev_init - initialize interposed device
+ * @ip_dev: interposed device
+ * @disk: disk for interposing
+ * @ofs: offset from the beginning of the disk
+ * @len: the length of the part of the disk to which requests will be 
interposed
+ * @context: parameter for interposing callback
+ * @interpose_fn: interposing callback
+ */
+void dm_interposer_dev_init(struct dm_interposed_dev *ip_dev,
+                           struct gendisk *disk, sector_t ofs, sector_t len,
+                           void *context, dm_interpose_bio_t interpose_fn)
+{
+       ip_dev->disk = disk;
+       ip_dev->node.start = ofs;
+       ip_dev->node.last = ofs + len - 1;
+       ip_dev->context = context;
+       ip_dev->dm_interpose_bio = interpose_fn;
+
+       atomic64_set(&ip_dev->ip_cnt, 0);
+}
+
+/**
+ * dm_interposer_dev_attach - attach interposed device to his disk
+ * @ip_dev: interposed device
+ *
+ * Return error code.
+ */
+int dm_interposer_dev_attach(struct dm_interposed_dev *ip_dev)
+{
+       int ret = 0;
+       struct dm_interposer *ip = NULL;
+       unsigned int noio_flag = 0;
+
+       if (!ip_dev)
+               return -EINVAL;
+
+       dm_disk_freeze(ip_dev->disk);
+       mutex_lock(&dm_interposer_attach_lock);
+       noio_flag = memalloc_noio_save();
+
+       ip = dm_interposer_get(ip_dev->disk);
+       if (ip == NULL)
+               ip = dm_interposer_new(ip_dev->disk);
+       if (IS_ERR(ip)) {
+               ret = PTR_ERR(ip);
+               goto out;
+       }
+
+       /* Attach dm_interposed_dev to dm_interposer */
+       down_write(&ip->ip_devs_lock);
+       do {
+               struct dm_rb_range *node;
+
+               /* checking that ip_dev already exists for this region */
+               node = dm_rb_iter_first(&ip->ip_devs_root, ip_dev->node.start, 
ip_dev->node.last);
+               if (node) {
+                       DMERR("Disk part form [%llu] to [%llu] already have 
interposer.",
+                             node->start, node->last);
+
+                       ret = -EBUSY;
+                       break;
+               }
+
+               /* insert ip_dev to ip tree */
+               dm_rb_insert(&ip_dev->node, &ip->ip_devs_root);
+               /* increment ip reference counter */
+               kref_get(&ip->kref);
+       } while (false);
+       up_write(&ip->ip_devs_lock);
+
+       kref_put(&ip->kref, dm_interposer_free);
+
+out:
+       memalloc_noio_restore(noio_flag);
+       mutex_unlock(&dm_interposer_attach_lock);
+       dm_disk_unfreeze(ip_dev->disk);
+
+       return ret;
+}
+
+/**
+ * dm_interposer_detach_dev - detach interposed device from his disk
+ * @ip_dev: interposed device
+ *
+ * Return error code.
+ */
+int dm_interposer_detach_dev(struct dm_interposed_dev *ip_dev)
+{
+       int ret = 0;
+       struct dm_interposer *ip = NULL;
+       unsigned int noio_flag = 0;
+
+       if (!ip_dev)
+               return -EINVAL;
+
+       dm_disk_freeze(ip_dev->disk);
+       mutex_lock(&dm_interposer_attach_lock);
+       noio_flag = memalloc_noio_save();
+
+       ip = dm_interposer_get(ip_dev->disk);
+       if (IS_ERR(ip)) {
+               ret = PTR_ERR(ip);
+               DMERR("Interposer not found.");
+               goto out;
+       }
+       if (unlikely(ip == NULL)) {
+               ret = -ENXIO;
+               DMERR("Interposer not found.");
+               goto out;
+       }
+
+       down_write(&ip->ip_devs_lock);
+       do {
+               dm_rb_remove(&ip_dev->node, &ip->ip_devs_root);
+               /* the reference counter here cannot be zero */
+               kref_put(&ip->kref, dm_interposer_free);
+
+       } while (false);
+       up_write(&ip->ip_devs_lock);
+
+       /* detach and free interposer if it's not needed */
+       kref_put(&ip->kref, dm_interposer_free);
+out:
+       memalloc_noio_restore(noio_flag);
+       mutex_unlock(&dm_interposer_attach_lock);
+       dm_disk_unfreeze(ip_dev->disk);
+
+       return ret;
+}
diff --git a/drivers/md/dm-interposer.h b/drivers/md/dm-interposer.h
new file mode 100644
index 000000000000..77333dc35a47
--- /dev/null
+++ b/drivers/md/dm-interposer.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Device mapper's interposer.
+ */
+
+#include <linux/rbtree.h>
+
+struct dm_rb_range {
+       struct rb_node node;
+       sector_t start;         /* start sector of rb node */
+       sector_t last;          /* end sector of rb node */
+       sector_t _subtree_last; /* highest sector in subtree of rb node */
+};
+
+typedef void (*dm_interpose_bio_t) (void *context, sector_t start_sect,  
struct bio *bio);
+
+struct dm_interposed_dev {
+       struct gendisk *disk;
+       struct dm_rb_range node;
+       void *context;
+       dm_interpose_bio_t dm_interpose_bio;
+
+       atomic64_t ip_cnt; /* for debug purpose only */
+};
+
+/*
+ * Just initialize structure dm_interposed_dev.
+ */
+void dm_interposer_dev_init(struct dm_interposed_dev *ip_dev,
+                           struct gendisk *disk, sector_t ofs, sector_t len,
+                           void *context, dm_interpose_bio_t interpose_fn);
+
+/*
+ * Attach interposer to his disk.
+ */
+int dm_interposer_dev_attach(struct dm_interposed_dev *ip_dev);
+/*
+ * Detach interposer from his disk.
+ */
+int dm_interposer_detach_dev(struct dm_interposed_dev *ip_dev);
-- 
2.20.1

>From 3d393e8a8d386c009a9b37e5bc40f66cd997c8bd Mon Sep 17 00:00:00 2001
From: Sergei Shtepa <[email protected]>
Date: Tue, 23 Feb 2021 13:46:19 +0100
Subject: [PATCH 5/5] dm: new DM_INTERPOSED_DEV_FLAG

DM_INTERPOSED_DEV_FLAG allow to open underlying device without
FMODE_EXCL flag and create md device "on fly".

Signed-off-by: Sergei Shtepa <[email protected]>
---
 drivers/md/dm-core.h          |   6 ++
 drivers/md/dm-ioctl.c         |   9 +++
 drivers/md/dm-table.c         | 104 +++++++++++++++++++++++++++++++---
 drivers/md/dm.c               |  38 +++++++++----
 include/linux/device-mapper.h |   1 +
 include/uapi/linux/dm-ioctl.h |   2 +
 6 files changed, 139 insertions(+), 21 deletions(-)

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 086d293c2b03..00fad638469f 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -20,6 +20,8 @@
 
 #define DM_RESERVED_MAX_IOS            1024
 
+struct dm_interposed_dev;
+
 struct dm_kobject_holder {
        struct kobject kobj;
        struct completion completion;
@@ -109,6 +111,10 @@ struct mapped_device {
        bool init_tio_pdu:1;
 
        struct srcu_struct io_barrier;
+
+       /* for interposers logic */
+       bool is_interposed;
+       struct dm_interposed_dev *ip_dev;
 };
 
 void disable_discard(struct mapped_device *md);
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 5e306bba4375..a7be745a6f2c 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1267,6 +1267,11 @@ static inline fmode_t get_mode(struct dm_ioctl *param)
        return mode;
 }
 
+static inline bool get_interposer_flag(struct dm_ioctl *param)
+{
+       return (param->flags & DM_INTERPOSED_DEV_FLAG);
+}
+
 static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
                       struct dm_target_spec **spec, char **target_params)
 {
@@ -1338,6 +1343,8 @@ static int table_load(struct file *filp, struct dm_ioctl 
*param, size_t param_si
        if (!md)
                return -ENXIO;
 
+       md->is_interposed = get_interposer_flag(param);
+
        r = dm_table_create(&t, get_mode(param), param->target_count, md);
        if (r)
                goto err;
@@ -2098,6 +2105,8 @@ int __init dm_early_create(struct dm_ioctl *dmi,
        if (r)
                goto err_hash_remove;
 
+       md->is_interposed = get_interposer_flag(dmi);
+
        /* add targets */
        for (i = 0; i < dmi->target_count; i++) {
                r = dm_table_add_target(t, spec_array[i]->target_type,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4acf2342f7ad..2d8109876706 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -6,6 +6,7 @@
  */
 
 #include "dm-core.h"
+#include "dm-interposer.h"
 
 #include <linux/module.h>
 #include <linux/vmalloc.h>
@@ -221,12 +222,13 @@ void dm_table_destroy(struct dm_table *t)
 /*
  * See if we've already got a device in the list.
  */
-static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
+static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev, 
bool is_interposed)
 {
        struct dm_dev_internal *dd;
 
        list_for_each_entry (dd, l, list)
-               if (dd->dm_dev->bdev->bd_dev == dev)
+               if ((dd->dm_dev->bdev->bd_dev == dev) &&
+                   (dd->dm_dev->is_interposed == is_interposed))
                        return dd;
 
        return NULL;
@@ -354,6 +356,79 @@ dev_t dm_get_dev_t(const char *path)
 }
 EXPORT_SYMBOL_GPL(dm_get_dev_t);
 
+/*
+ * Redirect bio from interposed device to dm device
+ */
+static void dm_interpose_fn(void *context, sector_t start_sect, struct bio 
*bio)
+{
+       struct mapped_device *md = context;
+
+       /* Set acceptor device.
+        * dm devices have only one block device on disk.
+        * So we can redirect directly to its disk without
+        * calculating dm block device offset.
+        */
+       bio->bi_disk = md->disk;
+
+       /* Remap disks offset */
+       bio->bi_iter.bi_sector -= start_sect;
+
+       /*
+        * bio should be resubmitted.
+        * We can just add bio to bio_list of the current process.
+        * current->bio_list must be initialized when this function is called.
+        * If call submit_bio_noacct(), the bio will be checked twice.
+        */
+       BUG_ON(!current->bio_list);
+       bio_list_add(&current->bio_list[0], bio);
+}
+
+static int _interposer_dev_create(struct block_device *bdev, sector_t ofs, 
sector_t len,
+                                 struct mapped_device *md)
+{
+       int ret;
+
+       DMDEBUG("Create dm interposer.");
+
+       if (md->ip_dev)
+               return -EALREADY;
+
+       if ((ofs + len) > bdev_nr_sectors(bdev))
+               return -ERANGE;
+
+       md->ip_dev = kzalloc(sizeof(struct dm_interposed_dev), GFP_KERNEL);
+       if (!md->ip_dev)
+               return -ENOMEM;
+
+       dm_interposer_dev_init(md->ip_dev, bdev->bd_disk,
+                              get_start_sect(bdev) + ofs, len,
+                              md, dm_interpose_fn);
+
+       ret = dm_interposer_dev_attach(md->ip_dev);
+       if (ret) {
+               DMERR("Failed to attach dm interposer.");
+               kfree(md->ip_dev);
+               md->ip_dev = NULL;
+       }
+
+       return ret;
+}
+
+static void _interposer_dev_remove(struct mapped_device *md)
+{
+       if (!md->ip_dev)
+               return;
+
+       DMDEBUG("Remove dm interposer. %llu bios was interposed.",
+               atomic64_read(&md->ip_dev->ip_cnt));
+
+       if (dm_interposer_detach_dev(md->ip_dev))
+               DMERR("Failed to detach dm interposer.");
+
+       kfree(md->ip_dev);
+       md->ip_dev = NULL;
+}
+
 /*
  * Add a device to the list, or just increment the usage count if
  * it's already present.
@@ -381,7 +456,7 @@ int dm_get_device(struct dm_target *ti, const char *path, 
fmode_t mode,
                        return -ENODEV;
        }
 
-       dd = find_device(&t->devices, dev);
+       dd = find_device(&t->devices, dev, t->md->is_interposed);
        if (!dd) {
                dd = kmalloc(sizeof(*dd), GFP_KERNEL);
                if (!dd)
@@ -394,15 +469,22 @@ int dm_get_device(struct dm_target *ti, const char *path, 
fmode_t mode,
 
                refcount_set(&dd->count, 1);
                list_add(&dd->list, &t->devices);
-               goto out;
-
        } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
                r = upgrade_mode(dd, mode, t->md);
                if (r)
                        return r;
+               refcount_inc(&dd->count);
        }
-       refcount_inc(&dd->count);
-out:
+
+       if (t->md->is_interposed) {
+               r = _interposer_dev_create(dd->dm_dev->bdev, ti->begin, 
ti->len, t->md);
+               if (r) {
+                       dm_put_device(ti, dd->dm_dev);
+                       DMERR("Failed to attach dm interposer.");
+                       return r;
+               }
+       }
+
        *result = dd->dm_dev;
        return 0;
 }
@@ -442,6 +524,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
 {
        int found = 0;
        struct list_head *devices = &ti->table->devices;
+       struct mapped_device *md = ti->table->md;
        struct dm_dev_internal *dd;
 
        list_for_each_entry(dd, devices, list) {
@@ -452,11 +535,14 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
        }
        if (!found) {
                DMWARN("%s: device %s not in table devices list",
-                      dm_device_name(ti->table->md), d->name);
+                      dm_device_name(md), d->name);
                return;
        }
+       if (md->is_interposed)
+               _interposer_dev_remove(md);
+
        if (refcount_dec_and_test(&dd->count)) {
-               dm_put_table_device(ti->table->md, d);
+               dm_put_table_device(md, d);
                list_del(&dd->list);
                kfree(dd);
        }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7bac564f3faa..f95226b50fed 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -747,16 +747,24 @@ static int open_table_device(struct table_device *td, 
dev_t dev,
 
        BUG_ON(td->dm_dev.bdev);
 
-       bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, 
_dm_claim_ptr);
-       if (IS_ERR(bdev))
-               return PTR_ERR(bdev);
+       if (md->is_interposed) {
 
-       r = bd_link_disk_holder(bdev, dm_disk(md));
-       if (r) {
-               blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
-               return r;
+               bdev = blkdev_get_by_dev(dev, td->dm_dev.mode, NULL);
+               if (IS_ERR(bdev))
+                       return PTR_ERR(bdev);
+       } else {
+               bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, 
_dm_claim_ptr);
+               if (IS_ERR(bdev))
+                       return PTR_ERR(bdev);
+
+               r = bd_link_disk_holder(bdev, dm_disk(md));
+               if (r) {
+                       blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
+                       return r;
+               }
        }
 
+       td->dm_dev.is_interposed = md->is_interposed;
        td->dm_dev.bdev = bdev;
        td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
        return 0;
@@ -770,20 +778,26 @@ static void close_table_device(struct table_device *td, 
struct mapped_device *md
        if (!td->dm_dev.bdev)
                return;
 
-       bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
-       blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
+       if (td->dm_dev.is_interposed)
+               blkdev_put(td->dm_dev.bdev, td->dm_dev.mode);
+       else {
+               bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
+               blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
+       }
        put_dax(td->dm_dev.dax_dev);
        td->dm_dev.bdev = NULL;
        td->dm_dev.dax_dev = NULL;
 }
 
 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
-                                             fmode_t mode)
+                                             fmode_t mode, bool is_interposed)
 {
        struct table_device *td;
 
        list_for_each_entry(td, l, list)
-               if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
+               if (td->dm_dev.bdev->bd_dev == dev &&
+                   td->dm_dev.mode == mode &&
+                   td->dm_dev.is_interposed == is_interposed)
                        return td;
 
        return NULL;
@@ -796,7 +810,7 @@ int dm_get_table_device(struct mapped_device *md, dev_t 
dev, fmode_t mode,
        struct table_device *td;
 
        mutex_lock(&md->table_devices_lock);
-       td = find_table_device(&md->table_devices, dev, mode);
+       td = find_table_device(&md->table_devices, dev, mode, 
md->is_interposed);
        if (!td) {
                td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
                if (!td) {
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 61a66fb8ebb3..c264b6beb12b 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -150,6 +150,7 @@ struct dm_dev {
        struct block_device *bdev;
        struct dax_device *dax_dev;
        fmode_t mode;
+       bool is_interposed;
        char name[16];
 };
 
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index 4933b6b67b85..033a84a9b0b7 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -362,4 +362,6 @@ enum {
  */
 #define DM_INTERNAL_SUSPEND_FLAG       (1 << 18) /* Out */
 
+#define DM_INTERPOSED_DEV_FLAG         (1 << 19) /* In */
+
 #endif                         /* _LINUX_DM_IOCTL_H */
-- 
2.20.1

--
dm-devel mailing list
[email protected]
https://listman.redhat.com/mailman/listinfo/dm-devel

Reply via email to