If the drive does not support zone-append natively, emulate it using
regular writes.
An emulated zone-append command write-locks its zone, preventing
concurrent appends/writes to the same zone.

To determine the start LBA for such writes, an array of 32-bit
zone-relative write-pointer (WP) positions is attached to the namespace.
The cached WP position is updated on successful completion as follows:
- APPEND/WRITE/WRITE_ZEROES/WRITE_SAME advance it by the number of sectors
(512b) written.
- ZONE_RESET sets it to 0 for the target zone. ZONE_RESET_ALL does the
same for all zones.
- ZONE_FINISH sets it to the zone size.

On failed completion of any of the above requests, the cached WP position of
the target zone is marked invalid. On a subsequent zone-append to that zone,
the WP position is refreshed by querying it from the device (i.e. via a zone
report).

If an emulated append cannot proceed immediately because of the zone
write-lock or an invalid WP position, the block layer is asked to retry it.

Signed-off-by: Kanchan Joshi <josh...@samsung.com>
Signed-off-by: Nitesh Shetty <nj.she...@samsung.com>
Signed-off-by: SelvaKumar S <selvakuma...@samsung.com>
Signed-off-by: Javier Gonzalez <javier.g...@samsung.com>
---
 drivers/nvme/host/core.c |  41 +++++-
 drivers/nvme/host/nvme.h |  60 ++++++++
 drivers/nvme/host/zns.c  | 306 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 398 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 88cff309d8e4..78faddf444c3 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -287,10 +287,17 @@ void nvme_complete_rq(struct request *req)
                        nvme_retry_req(req);
                        return;
                }
-       } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
-                  req_op(req) == REQ_OP_ZONE_APPEND) {
-               req->__sector = nvme_lba_to_sect(req->q->queuedata,
-                       le64_to_cpu(nvme_req(req)->result.u64));
+       } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+               bool need_wp_offset_update = false;
+               struct nvme_ns *ns = req->q->queuedata;
+               /* append-emulation requires wp update for some cmds*/
+               if (ns && nvme_is_append_emulated(ns))
+                       need_wp_offset_update = nvme_need_zone_wp_update(req);
+               if (need_wp_offset_update)
+                       nvme_zone_wp_update(ns, req, status);
+               else if (req_op(req) == REQ_OP_ZONE_APPEND)
+                       req->__sector = nvme_lba_to_sect(ns,
+                                       le64_to_cpu(nvme_req(req)->result.u64));
        }
 
        nvme_trace_bio_complete(req, status);
@@ -456,6 +463,8 @@ static void nvme_free_ns(struct kref *kref)
 {
        struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
 
+       if (nvme_is_append_emulated(ns))
+               nvme_teardown_append_emulate(ns);
        if (ns->ndev)
                nvme_nvm_unregister(ns);
 
@@ -809,7 +818,15 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
                ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
                break;
        case REQ_OP_ZONE_APPEND:
-               ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
+               if (!nvme_is_append_emulated(ns))
+                       ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
+               else {
+                       /* prepare append like write, and adjust lba afterwards */
+                       ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
+                       if (ret)
+                               break;
+                       ret = nvme_append_to_write(ns, req, cmd);
+               }
                break;
        default:
                WARN_ON_ONCE(1);
@@ -2150,7 +2167,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
                struct nvme_ns *ns = disk->private_data;
                struct nvme_ctrl *ctrl = ns->ctrl;
 
-               ret = blk_revalidate_disk_zones(disk, NULL);
+               ret = nvme_revalidate_disk_zones(disk);
                if (!ret)
                        blk_queue_max_zone_append_sectors(disk->queue,
                                                          ctrl->max_zone_append);
@@ -3900,6 +3917,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
        if (__nvme_revalidate_disk(disk, id))
                goto out_put_disk;
 
+       /* setup append-emulation if required */
+       if (nvme_is_append_emulated(ns)) {
+               ret = nvme_setup_append_emulate(ns);
+               if (ret) {
+                       dev_warn(ns->ctrl->device,
+                               "append-emulation failed, zoned namespace:%d\n",
+                               ns->head->ns_id);
+                       nvme_clear_append_emulated(ns);
+                       goto out_put_disk;
+               }
+       }
+
        if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
                ret = nvme_nvm_register(ns, disk_name, node);
                if (ret) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ebb8c3ed3885..c84d418fb001 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -421,6 +421,19 @@ enum nvme_ns_features {
        NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */
 };
 
+#ifdef CONFIG_BLK_DEV_ZONED
+/* Per-namespace state used to emulate zone-append with regular writes. */
+struct nvme_za_emul {
+       /* number of zones, derived from disk capacity and zone size */
+       unsigned int nr_zones;
+       /* protects zones_wp_offset[] */
+       spinlock_t zones_wp_offset_lock;
+       /* cached zone-relative write-pointer offset (512b sectors), per zone */
+       u32 *zones_wp_offset;
+       /* array allocated during revalidation and swapped with zones_wp_offset */
+       u32 *rev_wp_offset;
+       /* deferred work that re-reads write pointers from the device */
+       struct work_struct zone_wp_offset_work;
+       /* preallocated buffer for the single-zone report used by that work */
+       char *zone_wp_update_buf;
+       /* serializes nvme_revalidate_disk_zones() calls */
+       struct mutex rev_mutex;
+       /* namespace this state belongs to */
+       struct nvme_ns *ns;
+};
+#endif
+
 struct nvme_ns {
        struct list_head list;
 
@@ -443,6 +456,10 @@ struct nvme_ns {
        u8 pi_type;
 #ifdef CONFIG_BLK_DEV_ZONED
        u64 zsze;
+       /* set if append needs to be emulated */
+       u8 append_emulate;
+       /* contains all other append-emulation fields */
+       struct nvme_za_emul *za_emul;
 #endif
        unsigned long features;
        unsigned long flags;
@@ -759,9 +776,52 @@ int nvme_report_zones(struct gendisk *disk, sector_t sector,
 blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
                                       struct nvme_command *cmnd,
                                       enum nvme_zone_mgmt_action action);
+
+int nvme_revalidate_disk_zones(struct gendisk *disk);
+/* append-emulation only helpers */
+int nvme_setup_append_emulate(struct nvme_ns *ns);
+void nvme_teardown_append_emulate(struct nvme_ns *ns);
+blk_status_t nvme_append_to_write(struct nvme_ns *ns, struct request *req,
+                                 struct nvme_command *cmd);
+bool nvme_need_zone_wp_update(struct request *rq);
+void nvme_zone_wp_update(struct nvme_ns *ns, struct request *rq,
+                        blk_status_t status);
+void nvme_set_append_emulated(struct nvme_ns *ns);
+void nvme_clear_append_emulated(struct nvme_ns *ns);
+int nvme_is_append_emulated(struct nvme_ns *ns);
 #else
 #define nvme_report_zones NULL
 
+/*
+ * No-op stand-ins for the append-emulation helpers, used on the #else side
+ * of the zoned-support conditional (presumably !CONFIG_BLK_DEV_ZONED; the
+ * matching #ifdef lies outside this hunk). Emulation is reported as
+ * disabled and an attempt to convert an append is rejected.
+ */
+static inline void nvme_set_append_emulated(struct nvme_ns *ns) {}
+
+static inline void nvme_clear_append_emulated(struct nvme_ns *ns) {}
+
+static inline int nvme_is_append_emulated(struct nvme_ns *ns)
+{
+       return 0;
+}
+
+static inline int nvme_setup_append_emulate(struct nvme_ns *ns)
+{
+       return 0;
+}
+
+static inline void nvme_teardown_append_emulate(struct nvme_ns *ns) {}
+
+static inline blk_status_t nvme_append_to_write(struct nvme_ns *ns, struct request *req,
+                                               struct nvme_command *cmd)
+{
+       return BLK_STS_NOTSUPP;
+}
+
+static inline bool nvme_need_zone_wp_update(struct request *rq)
+{
+       return false;
+}
+
+static inline void nvme_zone_wp_update(struct nvme_ns *ns, struct request *rq,
+                        blk_status_t status) {}
+
+
 static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
                struct request *req, struct nvme_command *cmnd,
                enum nvme_zone_mgmt_action action)
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index cabd870fb64e..0b1e9f62045a 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -7,6 +7,10 @@
 #include <linux/vmalloc.h>
 #include "nvme.h"
 
+/*
+ * Sentinel values stored in the cached per-zone write-pointer array used
+ * for append-emulation:
+ *   INVALID  - cached offset is unusable and must be refreshed from device
+ *   UPDATING - a refresh has been scheduled and has not completed yet
+ */
+#define ZNS_INVALID_WP_OFST  (~0u)
+#define ZNS_UPDATING_WP_OFST (ZNS_INVALID_WP_OFST - 1)
+
 static int nvme_set_max_append(struct nvme_ctrl *ctrl)
 {
        struct nvme_command c = { };
@@ -44,13 +48,14 @@ int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
        struct nvme_id_ns_zns *id;
        int status;
 
-       /* Driver requires zone append support */
+       /* Driver does append-emulation if drive does not support zone-append */
        if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) &
                        NVME_CMD_EFFECTS_CSUPP)) {
                dev_warn(ns->ctrl->device,
-                       "append not supported for zoned namespace:%d\n",
+                       "append is emulated for zoned namespace:%d\n",
                        ns->head->ns_id);
-               return -EINVAL;
+               /* activate append-emulation */
+               nvme_set_append_emulated(ns);
        }
 
        /* Lazily query controller append limit for the first zoned namespace */
@@ -255,3 +260,298 @@ blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
 
        return BLK_STS_OK;
 }
+
+/*
+ * Callback handed to blk_revalidate_disk_zones(): installs the write-pointer
+ * array prepared in rev_wp_offset as the live cache and leaves the previous
+ * array in rev_wp_offset.
+ *
+ * The swap and the zone-count refresh are done together under
+ * zones_wp_offset_lock so that code walking the cache under that lock never
+ * pairs the new array with a stale nr_zones.
+ */
+static void nvme_revalidate_zones_cb(struct gendisk *disk)
+{
+       struct nvme_ns_head *head = NULL;
+       struct nvme_za_emul *za_emul;
+       struct nvme_ns *ns;
+       unsigned long flags;
+       int srcu_idx;
+
+       ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
+       /* no namespace to update: keep the current cache untouched */
+       if (unlikely(!ns))
+               return;
+
+       za_emul = ns->za_emul;
+       spin_lock_irqsave(&za_emul->zones_wp_offset_lock, flags);
+       swap(za_emul->zones_wp_offset, za_emul->rev_wp_offset);
+       /* same derivation as used when the emulation state is set up */
+       za_emul->nr_zones = get_capacity(disk) >> ilog2(ns->zsze);
+       spin_unlock_irqrestore(&za_emul->zones_wp_offset_lock, flags);
+
+       nvme_put_ns_from_disk(head, srcu_idx);
+}
+
+/* Non-zero when zone-append is emulated for @ns, zero otherwise. */
+inline int nvme_is_append_emulated(struct nvme_ns *ns)
+{
+       return ns->append_emulate == 1;
+}
+
+/* Mark @ns as needing zone-append emulation. */
+inline void nvme_set_append_emulated(struct nvme_ns *ns)
+{
+       ns->append_emulate = 1;
+}
+
+/* Mark @ns as not using zone-append emulation. */
+inline void nvme_clear_append_emulated(struct nvme_ns *ns)
+{
+       ns->append_emulate = 0;
+}
+
+/*
+ * Revalidate the zones of @disk.
+ *
+ * Without append-emulation this is a plain blk_revalidate_disk_zones().
+ * With emulation, calls are serialized by rev_mutex; if the zone count is
+ * unchanged nothing is done, otherwise a new write-pointer cache is
+ * allocated and installed through nvme_revalidate_zones_cb().
+ *
+ * Every entry of the new cache starts out as ZNS_INVALID_WP_OFST: nothing
+ * here fills in real write pointers, so a zero-filled array would claim all
+ * zones are empty. Starting invalid makes the first append to each zone
+ * refresh its write pointer from the device.
+ *
+ * Returns 0 on success (or when no rescan was needed), -ENOMEM if the new
+ * cache cannot be allocated, otherwise the blk_revalidate_disk_zones()
+ * result.
+ */
+int nvme_revalidate_disk_zones(struct gendisk *disk)
+{
+       struct nvme_ns *ns = disk->private_data;
+       struct nvme_za_emul *za_emul;
+       unsigned int nr_zones, i;
+       int ret = 0;
+
+       if (!nvme_is_append_emulated(ns))
+               return blk_revalidate_disk_zones(disk, NULL);
+
+       za_emul = ns->za_emul;
+       /* serialize multiple revalidate calls */
+       mutex_lock(&za_emul->rev_mutex);
+       nr_zones = get_capacity(disk) >> ilog2(ns->zsze);
+
+       /* avoid rescan zones if possible */
+       if (nr_zones == za_emul->nr_zones &&
+           disk->queue->nr_zones == nr_zones)
+               goto unlock;
+
+       za_emul->rev_wp_offset = kvcalloc(nr_zones,
+                                         sizeof(*za_emul->rev_wp_offset),
+                                         GFP_NOIO);
+       if (!za_emul->rev_wp_offset) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+       /* unknown until queried: force a refresh on first append per zone */
+       for (i = 0; i < nr_zones; i++)
+               za_emul->rev_wp_offset[i] = ZNS_INVALID_WP_OFST;
+
+       ret = blk_revalidate_disk_zones(disk, nvme_revalidate_zones_cb);
+       /*
+        * If the callback ran, rev_wp_offset now holds the old array;
+        * otherwise it still holds the unused new one. Free it either way.
+        */
+       kvfree(za_emul->rev_wp_offset);
+       za_emul->rev_wp_offset = NULL;
+unlock:
+       mutex_unlock(&za_emul->rev_mutex);
+       return ret;
+}
+
+/*
+ * Translate a reported zone into its zone-relative write-pointer offset:
+ * open/closed zones use wp - start, full zones use the zone length, and
+ * every other condition (empty, offline, read-only, unknown) yields 0.
+ */
+static unsigned int nvme_get_zone_wp_offset(struct blk_zone *zone)
+{
+       const unsigned int cond = zone->cond;
+
+       if (cond == BLK_ZONE_COND_FULL)
+               return zone->len;
+
+       if (cond == BLK_ZONE_COND_IMP_OPEN ||
+           cond == BLK_ZONE_COND_EXP_OPEN ||
+           cond == BLK_ZONE_COND_CLOSED)
+               return zone->wp - zone->start;
+
+       /*
+        * Offline and read-only zones do not have a valid
+        * write pointer. Use 0 as for an empty zone.
+        */
+       return 0;
+}
+
+/*
+ * Zone-report callback: store the write-pointer offset of @zone in cache
+ * slot @idx. @data is the struct nvme_za_emul; its zones_wp_offset_lock
+ * must be held by the caller. Always returns 0.
+ */
+static int nvme_update_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
+                                   void *data)
+{
+       struct nvme_za_emul *emul = data;
+       unsigned int wp_ofst;
+
+       lockdep_assert_held(&emul->zones_wp_offset_lock);
+       wp_ofst = nvme_get_zone_wp_offset(zone);
+       emul->zones_wp_offset[idx] = wp_ofst;
+       return 0;
+}
+
+/*
+ * Deferred work: for every zone whose cached write pointer is marked
+ * ZNS_UPDATING_WP_OFST, fetch a single-zone report from the device and
+ * store the reported offset. Drops the namespace reference taken when the
+ * work was queued.
+ *
+ * If a zone cannot be refreshed (report failed, or the entry did not reach
+ * the update callback), its slot falls back to ZNS_INVALID_WP_OFST. Leaving
+ * it as UPDATING would make every later append to that zone return
+ * BLK_STS_RESOURCE without anything scheduling a new refresh.
+ */
+static void nvme_update_wp_offset_workfn(struct work_struct *work)
+{
+       struct nvme_za_emul *za_emul;
+       struct nvme_ns *ns;
+       unsigned int zno;
+       unsigned long flags;
+       struct nvme_zone_report *report;
+       int buflen, ret;
+
+       buflen = sizeof(struct nvme_zone_report) +
+                                  sizeof(struct nvme_zone_descriptor);
+       za_emul = container_of(work, struct nvme_za_emul, zone_wp_offset_work);
+       ns = za_emul->ns;
+
+       spin_lock_irqsave(&za_emul->zones_wp_offset_lock, flags);
+
+       for (zno = 0; zno < za_emul->nr_zones; zno++) {
+               if (za_emul->zones_wp_offset[zno] != ZNS_UPDATING_WP_OFST)
+                       continue;
+               /* the lock is not held while the device is queried */
+               spin_unlock_irqrestore(&za_emul->zones_wp_offset_lock, flags);
+
+               report = (struct nvme_zone_report *)za_emul->zone_wp_update_buf;
+               memset(report, 0, buflen);
+               ret = __nvme_ns_report_zones(ns, (zno * ns->zsze),
+                                            report,
+                                            buflen);
+
+               spin_lock_irqsave(&za_emul->zones_wp_offset_lock, flags);
+               if (ret > 0)
+                       nvme_zone_parse_entry(ns, &report->entries[0],
+                                           zno, nvme_update_wp_offset_cb,
+                                           za_emul);
+               /* still UPDATING means no new value was stored: allow a retry */
+               if (za_emul->zones_wp_offset[zno] == ZNS_UPDATING_WP_OFST)
+                       za_emul->zones_wp_offset[zno] = ZNS_INVALID_WP_OFST;
+       }
+       spin_unlock_irqrestore(&za_emul->zones_wp_offset_lock, flags);
+       /* remove the reference obtained earlier */
+       nvme_put_ns(ns);
+}
+
+/*
+ * Turn an emulated zone-append (already set up as a write in @cmd) into a
+ * write at the cached write pointer of the target zone.
+ *
+ * The zone is write-locked for the duration of the command; the lock is
+ * released here on failure and by nvme_zone_wp_update() on completion.
+ *
+ * Returns:
+ *   BLK_STS_OK       - cmd->rw.slba now points at the zone write pointer
+ *   BLK_STS_RESOURCE - zone is write-locked, or its cached write pointer is
+ *                      invalid/being refreshed (a refresh is scheduled)
+ *   BLK_STS_IOERR    - the request does not fit in the rest of the zone
+ */
+blk_status_t nvme_append_to_write(struct nvme_ns *ns, struct request *req,
+                                 struct nvme_command *cmd)
+{
+       blk_status_t ret = BLK_STS_OK;
+       struct nvme_za_emul *za_emul = ns->za_emul;
+       unsigned int nr_sectors = (blk_rq_bytes(req) >> SECTOR_SHIFT);
+       unsigned int wp_offset, zno = blk_rq_zone_no(req);
+       sector_t lba = blk_rq_pos(req);
+       unsigned long flags;
+
+       if (!blk_req_zone_write_trylock(req))
+               return BLK_STS_RESOURCE;
+
+       spin_lock_irqsave(&za_emul->zones_wp_offset_lock, flags);
+       wp_offset = za_emul->zones_wp_offset[zno];
+       switch (wp_offset) {
+       case ZNS_INVALID_WP_OFST:
+               /*
+                * update zone wp-offset in a deferred worker.
+                * postpone processing current request until worker manages
+                * to refresh wp by querying from device.
+                */
+               kref_get(&ns->kref);
+               za_emul->zones_wp_offset[zno] = ZNS_UPDATING_WP_OFST;
+               /*
+                * The worker drops exactly one reference per run. If the
+                * work was already pending it is not queued again, so give
+                * back the reference just taken instead of leaking it.
+                */
+               if (!queue_work(nvme_wq, &za_emul->zone_wp_offset_work))
+                       nvme_put_ns(ns);
+               fallthrough;
+       case ZNS_UPDATING_WP_OFST:
+               ret = BLK_STS_RESOURCE;
+               break;
+       default:
+               if (wp_offset + nr_sectors > ns->zsze) {
+                       ret = BLK_STS_IOERR;
+                       break;
+               }
+               lba += wp_offset;
+       }
+       spin_unlock_irqrestore(&za_emul->zones_wp_offset_lock, flags);
+       /* unlock zone in case of error, update lba otherwise */
+       if (ret)
+               blk_req_zone_write_unlock(req);
+       else
+               cmd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, lba));
+       return ret;
+}
+
+/*
+ * Tell whether completing @rq must update the cached write pointer:
+ * always for append/finish/reset/reset-all, for the write variants only
+ * when blk_rq_zone_is_seq() says so, never for anything else.
+ */
+bool nvme_need_zone_wp_update(struct request *rq)
+{
+       const unsigned int op = req_op(rq);
+
+       if (op == REQ_OP_ZONE_APPEND || op == REQ_OP_ZONE_FINISH ||
+           op == REQ_OP_ZONE_RESET || op == REQ_OP_ZONE_RESET_ALL)
+               return true;
+
+       if (op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES ||
+           op == REQ_OP_WRITE_SAME)
+               return blk_rq_zone_is_seq(rq);
+
+       return false;
+}
+
+/*
+ * Completion-time bookkeeping for the cached write pointers.
+ *
+ * @status != 0: the target zone's cached offset is marked invalid (unless
+ * a refresh is already in progress); a failed reset-all invalidates every
+ * zone.
+ * @status == 0: the cache is advanced or reset according to the operation,
+ * and for an append rq->__sector is moved to where the data was written.
+ *
+ * For appends the zone write-lock taken at submission is released.
+ */
+void nvme_zone_wp_update(struct nvme_ns *ns, struct request *rq,
+                        blk_status_t status)
+{
+       struct nvme_za_emul *za_emul = ns->za_emul;
+       unsigned long flags;
+       unsigned int zno = blk_rq_zone_no(rq);
+       enum req_opf op = req_op(rq);
+       unsigned int res_bytes = blk_rq_bytes(rq);
+
+       spin_lock_irqsave(&za_emul->zones_wp_offset_lock, flags);
+       /*
+        * Failure handling first, mark wp_offset invalid.
+        * This will force updating wp from device on subsequent access
+        */
+       if (status) {
+               if (op != REQ_OP_ZONE_RESET_ALL) {
+                       if (za_emul->zones_wp_offset[zno] !=
+                                       ZNS_UPDATING_WP_OFST)
+                               za_emul->zones_wp_offset[zno] = ZNS_INVALID_WP_OFST;
+
+               } else
+                       /* all-ones bytes make every slot ZNS_INVALID_WP_OFST */
+                       memset(za_emul->zones_wp_offset, 0xff,
+                               za_emul->nr_zones *
+                               sizeof(*za_emul->zones_wp_offset));
+               goto unlock;
+       }
+       /* success case handling, update wp-offset */
+       switch (op) {
+       case REQ_OP_ZONE_APPEND:
+               /* report the position the emulated append was written at */
+               rq->__sector += za_emul->zones_wp_offset[zno];
+               fallthrough;
+       case REQ_OP_WRITE_ZEROES:
+       case REQ_OP_WRITE_SAME:
+       case REQ_OP_WRITE:
+               /* every write should update the wp_offset */
+               if (za_emul->zones_wp_offset[zno] < ns->zsze)
+                       za_emul->zones_wp_offset[zno] +=
+                                               res_bytes >> SECTOR_SHIFT;
+               break;
+       case REQ_OP_ZONE_RESET:
+               za_emul->zones_wp_offset[zno] = 0;
+               break;
+       case REQ_OP_ZONE_FINISH:
+               za_emul->zones_wp_offset[zno] = ns->zsze;
+               break;
+       case REQ_OP_ZONE_RESET_ALL:
+               memset(za_emul->zones_wp_offset, 0,
+                      za_emul->nr_zones *
+                      sizeof(*za_emul->zones_wp_offset));
+               break;
+       default:
+               break;
+       }
+unlock:
+       spin_unlock_irqrestore(&za_emul->zones_wp_offset_lock, flags);
+       /* release zone write-lock for append */
+       if (op == REQ_OP_ZONE_APPEND)
+               blk_req_zone_write_unlock(rq);
+}
+
+/*
+ * Allocate and initialise the append-emulation state of @ns, including the
+ * buffer for a single-zone report. The write-pointer cache itself is not
+ * allocated here. Returns 0 on success or -ENOMEM.
+ */
+int nvme_setup_append_emulate(struct nvme_ns *ns)
+{
+       const size_t report_len = sizeof(struct nvme_zone_report) +
+                                 sizeof(struct nvme_zone_descriptor);
+       struct nvme_za_emul *emul;
+
+       WARN_ON(ns->za_emul);
+       emul = kmalloc(sizeof(struct nvme_za_emul), GFP_KERNEL);
+       if (!emul)
+               return -ENOMEM;
+
+       /* preallocate buffer for single zone-report */
+       emul->zone_wp_update_buf = kzalloc(report_len, GFP_KERNEL);
+       if (!emul->zone_wp_update_buf) {
+               kfree(emul);
+               return -ENOMEM;
+       }
+
+       emul->zones_wp_offset = NULL;
+       emul->rev_wp_offset = NULL;
+       spin_lock_init(&emul->zones_wp_offset_lock);
+       mutex_init(&emul->rev_mutex);
+       INIT_WORK(&emul->zone_wp_offset_work, nvme_update_wp_offset_workfn);
+       emul->nr_zones = get_capacity(ns->disk) >> ilog2(ns->zsze);
+
+       /* link namespace and emulation state to each other */
+       emul->ns = ns;
+       ns->za_emul = emul;
+
+       return 0;
+}
+
+/*
+ * Free the append-emulation state of @ns: the write-pointer cache, the
+ * zone-report buffer and the state structure itself. Warns and returns if
+ * no state is attached. ns->za_emul is cleared so no dangling pointer is
+ * left behind.
+ */
+void nvme_teardown_append_emulate(struct nvme_ns *ns)
+{
+       struct nvme_za_emul *za_emul = ns->za_emul;
+
+       /* nothing was set up: warn, but do not dereference NULL */
+       if (WARN_ON(!za_emul))
+               return;
+
+       ns->za_emul = NULL;
+       kvfree(za_emul->zones_wp_offset);
+       kfree(za_emul->zone_wp_update_buf);
+       kfree(za_emul);
+}
-- 
2.17.1

Reply via email to