From: Hannes Reinecke <h...@suse.de>

Implement a RB-Tree holding a zoned block device zone information
(struct blk_zone) and add support functions for maintaining the
RB-Tree and manipulating zone structs. The block layer support does
not differentiate between host-aware and host-managed devices. The
different constraints for these different zone models are handled
by the generic SCSI layer sd driver down the stack.

Signed-off-by: Hannes Reinecke <h...@suse.de>

Changelog (Damien):
* Changed struct blk_zone to be more compact (64B)
* Changed zone locking to use bit_spin_lock in place of a regular
  spinlock
* Request zone operations to the underlying block device driver
  through BIO operations with the operation codes REQ_OP_ZONE_*.

Signed-off-by: Damien Le Moal <damien.lem...@hgst.com>
---
 block/Kconfig          |   8 ++
 block/Makefile         |   1 +
 block/blk-core.c       |   4 +
 block/blk-zoned.c      | 338 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h | 113 +++++++++++++++++
 5 files changed, 464 insertions(+)
 create mode 100644 block/blk-zoned.c

diff --git a/block/Kconfig b/block/Kconfig
index 161491d..c3a18f0 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -88,6 +88,14 @@ config BLK_DEV_INTEGRITY
        T10/SCSI Data Integrity Field or the T13/ATA External Path
        Protection.  If in doubt, say N.
 
+config BLK_DEV_ZONED
+       bool "Zoned block device support"
+       ---help---
+       Block layer zoned block device support. This option enables
+       support for ZAC/ZBC host-managed and host-aware zoned block devices.
+
+       Say yes here if you have a ZAC or ZBC storage device.
+
 config BLK_DEV_THROTTLING
        bool "Block layer bio throttling support"
        depends on BLK_CGROUP=y
diff --git a/block/Makefile b/block/Makefile
index 9eda232..aee67fa 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -22,4 +22,5 @@ obj-$(CONFIG_IOSCHED_CFQ)     += cfq-iosched.o
 obj-$(CONFIG_BLOCK_COMPAT)     += compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)       += cmdline-parser.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
+obj-$(CONFIG_BLK_DEV_ZONED)    += blk-zoned.o
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 4a7f7ba..2c5d069d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -590,6 +590,8 @@ void blk_cleanup_queue(struct request_queue *q)
                blk_mq_free_queue(q);
        percpu_ref_exit(&q->q_usage_counter);
 
+       blk_drop_zones(q);
+
        spin_lock_irq(lock);
        if (q->queue_lock != &q->__queue_lock)
                q->queue_lock = &q->__queue_lock;
@@ -728,6 +730,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 #endif
        INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
 
+       blk_init_zones(q);
+
        kobject_init(&q->kobj, &blk_queue_ktype);
 
        mutex_init(&q->sysfs_lock);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
new file mode 100644
index 0000000..a107940
--- /dev/null
+++ b/block/blk-zoned.c
@@ -0,0 +1,338 @@
+/*
+ * Zoned block device handling
+ *
+ * Copyright (c) 2015, Hannes Reinecke
+ * Copyright (c) 2015, SUSE Linux GmbH
+ *
+ * Copyright (c) 2016, Damien Le Moal
+ * Copyright (c) 2016, Western Digital
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+#include <linux/blkdev.h>
+
+void blk_init_zones(struct request_queue *q)
+{
+       spin_lock_init(&q->zones_lock);
+       q->zones = RB_ROOT;
+}
+
+/**
+ * blk_drop_zones - Empty a zoned device zone tree.
+ * @q: queue of the zoned device to operate on
+ *
+ * Free all zone descriptors added to the queue zone tree.
+ */
+void blk_drop_zones(struct request_queue *q)
+{
+       struct rb_root *root = &q->zones;
+       struct blk_zone *zone, *next;
+
+       rbtree_postorder_for_each_entry_safe(zone, next, root, node)
+               kfree(zone);
+       q->zones = RB_ROOT;
+}
+EXPORT_SYMBOL_GPL(blk_drop_zones);
+
+/**
+ * blk_insert_zone - Add a new zone struct to the queue RB-tree.
+ * @q: queue of the zoned device to operate on
+ * @new_zone: The zone struct to add
+ *
+ * If @new_zone is not already added to the zone tree, add it.
+ * Otherwise, return the existing entry.
+ */
+struct blk_zone *blk_insert_zone(struct request_queue *q,
+                                struct blk_zone *new_zone)
+{
+       struct rb_root *root = &q->zones;
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+       struct blk_zone *zone = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&q->zones_lock, flags);
+
+       /* Figure out where to put new node */
+       while (*new) {
+               zone = container_of(*new, struct blk_zone, node);
+               parent = *new;
+               if (new_zone->start + new_zone->len <= zone->start)
+                       new = &((*new)->rb_left);
+               else if (new_zone->start >= zone->start + zone->len)
+                       new = &((*new)->rb_right);
+               else
+                       /* Return existing zone */
+                       break;
+               zone = NULL;
+       }
+
+       if (!zone) {
+               /* No existing zone: add new node and rebalance tree */
+               rb_link_node(&new_zone->node, parent, new);
+               rb_insert_color(&new_zone->node, root);
+       }
+
+       spin_unlock_irqrestore(&q->zones_lock, flags);
+
+       return zone;
+}
+EXPORT_SYMBOL_GPL(blk_insert_zone);
+
+/**
+ * blk_lookup_zone - Search a zone in a zoned device zone tree.
+ * @q: queue of the zoned device tree to search
+ * @sector: A sector within the zone to search for
+ *
+ * Search the zone containing @sector in the zone tree owned
+ * by @q. NULL is returned if the zone is not found. Since this
+ * can be called concurrently with blk_insert_zone during device
+ * initialization, the tree traversal is protected using the
+ * zones_lock of the queue.
+ */
+struct blk_zone *blk_lookup_zone(struct request_queue *q, sector_t sector)
+{
+       struct rb_root *root = &q->zones;
+       struct rb_node *node = root->rb_node;
+       struct blk_zone *zone = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&q->zones_lock, flags);
+
+       while (node) {
+               zone = container_of(node, struct blk_zone, node);
+               if (sector < zone->start)
+                       node = node->rb_left;
+               else if (sector >= zone->start + zone->len)
+                       node = node->rb_right;
+               else
+                       break;
+               zone = NULL;
+       }
+
+       spin_unlock_irqrestore(&q->zones_lock, flags);
+
+       return zone;
+}
+EXPORT_SYMBOL_GPL(blk_lookup_zone);
+
+/**
+ * Execute a zone operation (REQ_OP_ZONE*)
+ */
+static int blkdev_issue_zone_operation(struct block_device *bdev,
+                                      unsigned int op,
+                                      sector_t sector, sector_t nr_sects,
+                                      gfp_t gfp_mask)
+{
+       struct bio *bio;
+       int ret;
+
+       if (!bdev_zoned(bdev))
+               return -EOPNOTSUPP;
+
+       /*
+        * Make sure bi_size does not overflow because
+        * of some weird very large zone size.
+        */
+       if (nr_sects && (unsigned long long)nr_sects << 9 > UINT_MAX)
+               return -EINVAL;
+
+       bio = bio_alloc(gfp_mask, 1);
+       if (!bio)
+               return -ENOMEM;
+
+       bio->bi_iter.bi_sector = sector;
+       bio->bi_iter.bi_size = nr_sects << 9;
+       bio->bi_vcnt = 0;
+       bio->bi_bdev = bdev;
+       bio_set_op_attrs(bio, op, 0);
+
+       ret = submit_bio_wait(bio);
+
+       bio_put(bio);
+
+       return ret;
+}
+
+/**
+ * blkdev_update_zones - Force an update of a device zone information
+ * @bdev:      Target block device
+ *
+ * Force an update of all zones information of @bdev. This call does not
+ * block waiting for the update to complete. On return, all zones are only
+ * marked as "in-update". Waiting on the zone update to complete can be done
+ * on a per zone basis using the function blk_wait_for_zone_update.
+ */
+int blkdev_update_zones(struct block_device *bdev,
+                       gfp_t gfp_mask)
+{
+       return blkdev_issue_zone_operation(bdev, REQ_OP_ZONE_REPORT,
+                                          0, 0, gfp_mask);
+}
+
+/*
+ * Wait for a zone update to complete.
+ */
+static void __blk_wait_for_zone_update(struct blk_zone *zone)
+{
+       might_sleep();
+       if (test_bit(BLK_ZONE_IN_UPDATE, &zone->flags))
+               wait_on_bit_io(&zone->flags, BLK_ZONE_IN_UPDATE,
+                              TASK_UNINTERRUPTIBLE);
+}
+
+/**
+ * blk_wait_for_zone_update - Wait for a zone information update
+ * @zone: The zone to wait for
+ *
+ * This must be called with the zone lock held. If @zone is not
+ * under update, returns immediately. Otherwise, wait for the
+ * update flag to be cleared on completion of the zone information
+ * update by the device driver.
+ */
+void blk_wait_for_zone_update(struct blk_zone *zone)
+{
+       WARN_ON_ONCE(!test_bit(BLK_ZONE_LOCKED, &zone->flags));
+       while (test_bit(BLK_ZONE_IN_UPDATE, &zone->flags)) {
+               blk_unlock_zone(zone);
+               __blk_wait_for_zone_update(zone);
+               blk_lock_zone(zone);
+       }
+}
+
+/**
+ * blkdev_report_zone - Get a zone information
+ * @bdev:      Target block device
+ * @sector:    A sector of the zone to report
+ * @update:    Force an update of the zone information
+ * @gfp_mask:  Memory allocation flags (for bio_alloc)
+ *
+ * Get a zone from the zone cache and return it.
+ * If update is requested, issue a report zone operation
+ * and wait for the zone information to be updated.
+ */
+struct blk_zone *blkdev_report_zone(struct block_device *bdev,
+                                   sector_t sector,
+                                   bool update,
+                                   gfp_t gfp_mask)
+{
+       struct request_queue *q = bdev_get_queue(bdev);
+       struct blk_zone *zone;
+       int ret;
+
+       zone = blk_lookup_zone(q, sector);
+       if (!zone)
+               return ERR_PTR(-ENXIO);
+
+       if (update) {
+               ret = blkdev_issue_zone_operation(bdev, REQ_OP_ZONE_REPORT,
+                                                 zone->start, zone->len,
+                                                 gfp_mask);
+               if (ret)
+                       return ERR_PTR(ret);
+               __blk_wait_for_zone_update(zone);
+       }
+
+       return zone;
+}
+
+/**
+ * Execute a zone action (open, close, reset or finish).
+ */
+static int blkdev_issue_zone_action(struct block_device *bdev,
+                                   sector_t sector, unsigned int op,
+                                   gfp_t gfp_mask)
+{
+       struct request_queue *q = bdev_get_queue(bdev);
+       struct blk_zone *zone;
+       sector_t nr_sects;
+       int ret;
+
+       if (!blk_queue_zoned(q))
+               return -EOPNOTSUPP;
+
+       if (sector == ~0ULL) {
+               /* All zones */
+               sector = 0;
+               nr_sects = 0;
+       } else {
+               /* This zone */
+               zone = blk_lookup_zone(q, sector);
+               if (!zone)
+                       return -ENXIO;
+               sector = zone->start;
+               nr_sects = zone->len;
+       }
+
+       ret = blkdev_issue_zone_operation(bdev, op, sector,
+                                         nr_sects, gfp_mask);
+       if (ret == 0 && !nr_sects)
+               blkdev_update_zones(bdev, gfp_mask);
+
+       return ret;
+}
+
+/**
+ * blkdev_reset_zone - Reset a zone write pointer
+ * @bdev:      target block device
+ * @sector:    A sector of the zone to reset or ~0ULL for all zones.
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ *    Reset a zone or all zones write pointer.
+ */
+int blkdev_reset_zone(struct block_device *bdev,
+                     sector_t sector, gfp_t gfp_mask)
+{
+       return blkdev_issue_zone_action(bdev, sector, REQ_OP_ZONE_RESET,
+                                       gfp_mask);
+}
+
+/**
+ * blkdev_open_zone - Explicitly open a zone
+ * @bdev:      target block device
+ * @sector:    A sector of the zone to open or ~0ULL for all zones.
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ *    Open a zone or all possible zones.
+ */
+int blkdev_open_zone(struct block_device *bdev,
+                    sector_t sector, gfp_t gfp_mask)
+{
+       return blkdev_issue_zone_action(bdev, sector, REQ_OP_ZONE_OPEN,
+                                       gfp_mask);
+}
+
+/**
+ * blkdev_close_zone - Close an open zone
+ * @bdev:      target block device
+ * @sector:    A sector of the zone to close or ~0ULL for all zones.
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ *    Close a zone or all open zones.
+ */
+int blkdev_close_zone(struct block_device *bdev,
+                     sector_t sector, gfp_t gfp_mask)
+{
+       return blkdev_issue_zone_action(bdev, sector, REQ_OP_ZONE_CLOSE,
+                                       gfp_mask);
+}
+
+/**
+ * blkdev_finish_zone - Finish a zone (make it full)
+ * @bdev:      target block device
+ * @sector:    A sector of the zone to finish or ~0ULL for all zones.
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ *    Finish one zone or all possible zones.
+ */
+int blkdev_finish_zone(struct block_device *bdev,
+                      sector_t sector, gfp_t gfp_mask)
+{
+       return blkdev_issue_zone_action(bdev, sector, REQ_OP_ZONE_FINISH,
+                                       gfp_mask);
+}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1c74b19..1165594 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -24,6 +24,7 @@
 #include <linux/rcupdate.h>
 #include <linux/percpu-refcount.h>
 #include <linux/scatterlist.h>
+#include <linux/bit_spinlock.h>
 
 struct module;
 struct scsi_ioctl_command;
@@ -302,6 +303,113 @@ struct queue_limits {
        unsigned char           zoned;
 };
 
+#ifdef CONFIG_BLK_DEV_ZONED
+
+enum blk_zone_type {
+       BLK_ZONE_TYPE_UNKNOWN,
+       BLK_ZONE_TYPE_CONVENTIONAL,
+       BLK_ZONE_TYPE_SEQWRITE_REQ,
+       BLK_ZONE_TYPE_SEQWRITE_PREF,
+};
+
+enum blk_zone_cond {
+       BLK_ZONE_COND_NO_WP,
+       BLK_ZONE_COND_EMPTY,
+       BLK_ZONE_COND_IMP_OPEN,
+       BLK_ZONE_COND_EXP_OPEN,
+       BLK_ZONE_COND_CLOSED,
+       BLK_ZONE_COND_READONLY = 0xd,
+       BLK_ZONE_COND_FULL,
+       BLK_ZONE_COND_OFFLINE,
+};
+
+enum blk_zone_flags {
+       BLK_ZONE_LOCKED,
+       BLK_ZONE_WRITE_LOCKED,
+       BLK_ZONE_IN_UPDATE,
+};
+
+/**
+ * Zone descriptor. On 64-bits architectures,
+ * this will align on sizeof(long), i.e. 64 B,
+ * and use 64 B.
+ */
+struct blk_zone {
+       struct rb_node  node;
+       unsigned long   flags;
+       sector_t        len;
+       sector_t        start;
+       sector_t        wp;
+       unsigned int    type : 4;
+       unsigned int    cond : 4;
+       unsigned int    non_seq : 1;
+       unsigned int    reset : 1;
+};
+
+#define blk_zone_is_seq_req(z) ((z)->type == BLK_ZONE_TYPE_SEQWRITE_REQ)
+#define blk_zone_is_seq_pref(z)        ((z)->type == BLK_ZONE_TYPE_SEQWRITE_PREF)
+#define blk_zone_is_seq(z)     (blk_zone_is_seq_req(z) || blk_zone_is_seq_pref(z))
+#define blk_zone_is_conv(z)    ((z)->type == BLK_ZONE_TYPE_CONVENTIONAL)
+
+#define blk_zone_is_readonly(z)        ((z)->cond == BLK_ZONE_COND_READONLY)
+#define blk_zone_is_offline(z)         ((z)->cond == BLK_ZONE_COND_OFFLINE)
+#define blk_zone_is_full(z)    ((z)->cond == BLK_ZONE_COND_FULL)
+#define blk_zone_is_empty(z)   ((z)->cond == BLK_ZONE_COND_EMPTY)
+#define blk_zone_is_open(z)    ((z)->cond == BLK_ZONE_COND_EXP_OPEN)
+
+static inline void blk_lock_zone(struct blk_zone *zone)
+{
+       bit_spin_lock(BLK_ZONE_LOCKED, &zone->flags);
+}
+
+static inline int blk_trylock_zone(struct blk_zone *zone)
+{
+       return bit_spin_trylock(BLK_ZONE_LOCKED, &zone->flags);
+}
+
+static inline void blk_unlock_zone(struct blk_zone *zone)
+{
+       bit_spin_unlock(BLK_ZONE_LOCKED, &zone->flags);
+}
+
+static inline int blk_try_write_lock_zone(struct blk_zone *zone)
+{
+       return !test_and_set_bit(BLK_ZONE_WRITE_LOCKED, &zone->flags);
+}
+
+static inline void blk_write_unlock_zone(struct blk_zone *zone)
+{
+       clear_bit_unlock(BLK_ZONE_WRITE_LOCKED, &zone->flags);
+       smp_mb__after_atomic();
+}
+
+extern void blk_init_zones(struct request_queue *);
+extern void blk_drop_zones(struct request_queue *);
+extern struct blk_zone *blk_insert_zone(struct request_queue *,
+                                       struct blk_zone *);
+extern struct blk_zone *blk_lookup_zone(struct request_queue *, sector_t);
+
+extern int blkdev_update_zones(struct block_device *, gfp_t);
+extern void blk_wait_for_zone_update(struct blk_zone *);
+#define blk_zone_in_update(z)  test_bit(BLK_ZONE_IN_UPDATE, &(z)->flags)
+static inline void blk_clear_zone_update(struct blk_zone *zone)
+{
+       clear_bit_unlock(BLK_ZONE_IN_UPDATE, &zone->flags);
+       smp_mb__after_atomic();
+       wake_up_bit(&zone->flags, BLK_ZONE_IN_UPDATE);
+}
+
+extern struct blk_zone *blkdev_report_zone(struct block_device *,
+                                          sector_t, bool, gfp_t);
+extern int blkdev_reset_zone(struct block_device *, sector_t, gfp_t);
+extern int blkdev_open_zone(struct block_device *, sector_t, gfp_t);
+extern int blkdev_close_zone(struct block_device *, sector_t, gfp_t);
+extern int blkdev_finish_zone(struct block_device *, sector_t, gfp_t);
+#else /* CONFIG_BLK_DEV_ZONED */
+static inline void blk_init_zones(struct request_queue *q) { };
+static inline void blk_drop_zones(struct request_queue *q) { };
+#endif /* CONFIG_BLK_DEV_ZONED */
+
 struct request_queue {
        /*
         * Together with queue_head for cacheline sharing
@@ -404,6 +512,11 @@ struct request_queue {
        unsigned int            nr_pending;
 #endif
 
+#ifdef CONFIG_BLK_DEV_ZONED
+       spinlock_t              zones_lock;
+       struct rb_root          zones;
+#endif
+
        /*
         * queue settings
         */
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to