[RFC PATCH 4/5] nvme: LightNVM integration

Matias Bjørling Tue, 18 Nov 2014 11:43:40 -0800

NVMe devices are identified by the vendor specific bits:

Bit 3 in OACS (device-wide). Currently made per device, as the nvme
namespace is missing in the completion path. This is _not_ to be kept
and is only added temporarily, to hint blk-mq that it should
reserve space in the per-request private data field for LightNVM.


Bit 1 in DSM (per-namespace).

From there, the NVMe specification is extended with the following
commands:

  LightNVM Identify
  LightNVM Get Features
  LightNVM Set Responsibility
  LightNVM Synchronous/Asynchronous erase
  LightNVM Get Logical to Physical map

The NVMe integration can be tested using Keith Busch's NVMe qemu simulator
with LightNVM patches on top. This can be found at:

  https://github.com/OpenChannelSSD/qemu-nvme

Contributions in this patch from:

  Jesper Madsen <[email protected]>

Signed-off-by: Matias Bjørling <[email protected]>
---
 drivers/block/nvme-core.c | 187 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/nvme.h      |   1 +
 include/uapi/linux/nvme.h |  74 ++++++++++++++++++
 3 files changed, 261 insertions(+), 1 deletion(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 337878b..e012c02 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -38,6 +38,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/types.h>
+#include <linux/lightnvm.h>
 #include <scsi/sg.h>
 #include <asm-generic/io-64-nonatomic-lo-hi.h>
 
@@ -129,6 +130,7 @@ static inline void _nvme_check_size(void)
        BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
        BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
        BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+       BUILD_BUG_ON(sizeof(struct nvme_lnvm_rw_command) != 64);
 }
 
 typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
@@ -560,6 +562,9 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
        cmnd->rw.control = cpu_to_le16(control);
        cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
 
+       if (req->cmd_flags & REQ_NVM_MAPPED)
+               cmnd->lnvm_rw.phys_addr = cpu_to_le64(req->phys_sector + 1);
+
        if (++nvmeq->sq_tail == nvmeq->q_depth)
                nvmeq->sq_tail = 0;
        writel(nvmeq->sq_tail, nvmeq->q_db);
@@ -576,6 +581,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
        enum dma_data_direction dma_dir;
        int psegs = req->nr_phys_segments;
        int result = BLK_MQ_RQ_QUEUE_BUSY;
+
        /*
         * Requeued IO has already been prepped
         */
@@ -895,6 +901,43 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
        return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
 }
 
+int lnvm_identify(struct nvme_dev *dev, u32 chnl_off, dma_addr_t dma_addr)
+{
+       struct nvme_command c;
+
+       memset(&c, 0, sizeof(c));
+       c.common.opcode = lnvm_admin_identify;
+       c.common.nsid = cpu_to_le32(chnl_off);
+       c.common.prp1 = cpu_to_le64(dma_addr);
+
+       return nvme_submit_admin_cmd(dev, &c, NULL);
+}
+
+int lnvm_get_features(struct nvme_dev *dev, unsigned nsid, dma_addr_t dma_addr)
+{
+       struct nvme_command c;
+
+       memset(&c, 0, sizeof(c));
+       c.common.opcode = lnvm_admin_get_features;
+       c.common.nsid = cpu_to_le32(nsid);
+       c.common.prp1 = cpu_to_le64(dma_addr);
+
+       return nvme_submit_admin_cmd(dev, &c, NULL);
+}
+
+int lnvm_set_responsibility(struct nvme_dev *dev, unsigned nsid,
+                                                       dma_addr_t dma_addr)
+{
+       struct nvme_command c;
+
+       memset(&c, 0, sizeof(c));
+       c.common.opcode = lnvm_admin_set_responsibility;
+       c.common.nsid = cpu_to_le32(nsid);
+       c.common.prp1 = cpu_to_le64(dma_addr);
+
+       return nvme_submit_admin_cmd(dev, &c, NULL);
+}
+
 int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
                                                        dma_addr_t dma_addr)
 {
@@ -1282,6 +1325,99 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev)
        return 0;
 }
 
+static int init_chnls(struct nvme_dev *dev, struct nvm_id *nvm_id,
+                       struct nvme_lnvm_id *dma_buf, dma_addr_t dma_addr)
+{
+       struct nvme_lnvm_id_chnl *src = dma_buf->chnls;
+       struct nvm_id_chnl *dst = nvm_id->chnls;
+       unsigned int len = nvm_id->nchannels;
+       int i, end, off = 0;
+
+       while (len) {
+               end = min_t(u32, NVME_LNVM_CHNLS_PR_REQ, len);
+
+               for (i = 0; i < end; i++, dst++, src++) {
+                       dst->queue_size = le64_to_cpu(src->queue_size);
+                       dst->gran_read = le64_to_cpu(src->gran_read);
+                       dst->gran_write = le64_to_cpu(src->gran_write);
+                       dst->gran_erase = le64_to_cpu(src->gran_erase);
+                       dst->oob_size = le64_to_cpu(src->oob_size);
+                       dst->t_r = le32_to_cpu(src->t_r);
+                       dst->t_sqr = le32_to_cpu(src->t_sqr);
+                       dst->t_w = le32_to_cpu(src->t_w);
+                       dst->t_sqw = le32_to_cpu(src->t_sqw);
+                       dst->t_e = le32_to_cpu(src->t_e);
+                       dst->io_sched = src->io_sched;
+                       dst->laddr_begin = le64_to_cpu(src->laddr_begin);
+                       dst->laddr_end = le64_to_cpu(src->laddr_end);
+               }
+
+               len -= end;
+               if (!len)
+                       break;
+
+               off += end;
+
+               if (lnvm_identify(dev, off, dma_addr))
+                       return -EIO;
+
+               src = dma_buf->chnls;
+       }
+       return 0;
+}
+
+static int nvme_nvm_id(struct request_queue *q, struct nvm_id *nvm_id)
+{
+       struct nvme_ns *ns = q->queuedata;
+       struct nvme_dev *dev = ns->dev;
+       struct pci_dev *pdev = dev->pci_dev;
+       struct nvme_lnvm_id *ctrl;
+       dma_addr_t dma_addr;
+       unsigned int ret;
+
+       ctrl = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL);
+       if (!ctrl)
+               return -ENOMEM;
+
+       ret = lnvm_identify(dev, 0, dma_addr);
+       if (ret) {
+               ret = -EIO;
+               goto out;
+       }
+
+       nvm_id->ver_id = le16_to_cpu(ctrl->ver_id);
+       nvm_id->nvm_type = ctrl->nvm_type;
+       nvm_id->nchannels = le16_to_cpu(ctrl->nchannels);
+
+       if (!nvm_id->chnls)
+               nvm_id->chnls = kmalloc(sizeof(struct nvm_id_chnl)
+                                       * nvm_id->nchannels, GFP_KERNEL);
+
+       if (!nvm_id->chnls) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ret = init_chnls(dev, nvm_id, ctrl, dma_addr);
+out:
+       dma_free_coherent(&pdev->dev, 4096, ctrl, dma_addr);
+       return ret;
+}
+
+static int nvme_nvm_get_features(struct request_queue *q,
+                                               struct nvm_get_features *gf)
+{
+       gf->rsp[0] = (1 << NVM_RSP_L2P);
+       gf->rsp[0] |= (1 << NVM_RSP_P2L);
+       gf->rsp[0] |= (1 << NVM_RSP_GC);
+       return 0;
+}
+
+static int nvme_nvm_set_rsp(struct request_queue *q, u8 rsp, u8 val)
+{
+       return NVM_RID_NOT_CHANGEABLE | NVM_DNR;
+}
+
 static struct blk_mq_ops nvme_mq_admin_ops = {
        .queue_rq       = nvme_admin_queue_rq,
        .map_queue      = blk_mq_map_queue,
@@ -1290,6 +1426,12 @@ static struct blk_mq_ops nvme_mq_admin_ops = {
        .timeout        = nvme_timeout,
 };
 
+static struct lightnvm_dev_ops nvme_nvm_dev_ops = {
+       .identify               = nvme_nvm_id,
+       .get_features           = nvme_nvm_get_features,
+       .set_responsibility     = nvme_nvm_set_rsp,
+};
+
 static struct blk_mq_ops nvme_mq_ops = {
        .queue_rq       = nvme_queue_rq,
        .map_queue      = blk_mq_map_queue,
@@ -1455,6 +1597,26 @@ void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
                put_page(sg_page(&iod->sg[i]));
 }
 
+static int nvme_nvm_submit_io(struct nvme_ns *ns, struct nvme_user_io *io)
+{
+       struct nvme_command c;
+       struct nvme_dev *dev = ns->dev;
+
+       memset(&c, 0, sizeof(c));
+       c.rw.opcode = io->opcode;
+       c.rw.flags = io->flags;
+       c.rw.nsid = cpu_to_le32(ns->ns_id);
+       c.rw.slba = cpu_to_le64(io->slba);
+       c.rw.length = cpu_to_le16(io->nblocks);
+       c.rw.control = cpu_to_le16(io->control);
+       c.rw.dsmgmt = cpu_to_le32(io->dsmgmt);
+       c.rw.reftag = cpu_to_le32(io->reftag);
+       c.rw.apptag = cpu_to_le16(io->apptag);
+       c.rw.appmask = cpu_to_le16(io->appmask);
+
+       return nvme_submit_io_cmd(dev, ns, &c, NULL);
+}
+
 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 {
        struct nvme_dev *dev = ns->dev;
@@ -1480,6 +1642,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
        case nvme_cmd_compare:
                iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
                break;
+       case lnvm_admin_identify:
+       case lnvm_admin_get_features:
+       case lnvm_admin_set_responsibility:
+               return nvme_nvm_submit_io(ns, &io);
        default:
                return -EINVAL;
        }
@@ -1769,7 +1935,6 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
        ns->queue = blk_mq_init_queue(&dev->tagset);
        if (!ns->queue)
                goto out_free_ns;
-       queue_flag_set_unlocked(QUEUE_FLAG_DEFAULT, ns->queue);
        queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
        queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue);
@@ -1807,8 +1972,18 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
        if (dev->oncs & NVME_CTRL_ONCS_DSM)
                nvme_config_discard(ns);
 
+       if (id->nsfeat & NVME_NS_FEAT_LIGHTNVM) {
+               if (blk_lightnvm_register(ns->queue, &nvme_nvm_dev_ops))
+                       goto out_put_disk;
+
+               /* FIXME: This will be handled later by ns */
+               ns->queue->nvm->drv_cmd_size = sizeof(struct nvme_cmd_info);
+       }
+
        return ns;
 
+ out_put_disk:
+       put_disk(disk);
  out_free_queue:
        blk_cleanup_queue(ns->queue);
  out_free_ns:
@@ -1954,6 +2129,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
        ctrl = mem;
        nn = le32_to_cpup(&ctrl->nn);
        dev->oncs = le16_to_cpup(&ctrl->oncs);
+       dev->oacs = le16_to_cpup(&ctrl->oacs);
        dev->abort_limit = ctrl->acl + 1;
        dev->vwc = ctrl->vwc;
        memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
@@ -1983,6 +2159,15 @@ static int nvme_dev_add(struct nvme_dev *dev)
        dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
        dev->tagset.driver_data = dev;
 
+       /* LightNVM is actually per ns, but as the tagset is defined with a set
+        * of operations for the whole device, it is currently either all or
+        * no LightNVM-compatible namespaces for a given device.
+        */
+       if (dev->oacs & NVME_CTRL_OACS_LIGHTNVM) {
+               dev->tagset.flags &= ~BLK_MQ_F_SHOULD_MERGE;
+               dev->tagset.flags |= BLK_MQ_F_LIGHTNVM;
+       }
+
        if (blk_mq_alloc_tag_set(&dev->tagset))
                goto out;
 
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 299e6f5..89aed50 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -100,6 +100,7 @@ struct nvme_dev {
        u32 max_hw_sectors;
        u32 stripe_size;
        u16 oncs;
+       u16 oacs;
        u16 abort_limit;
        u8 vwc;
        u8 initialized;
diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h
index 29a7d86..c3d1e9a 100644
--- a/include/uapi/linux/nvme.h
+++ b/include/uapi/linux/nvme.h
@@ -85,6 +85,35 @@ struct nvme_id_ctrl {
        __u8                    vs[1024];
 };
 
+struct nvme_lnvm_id_chnl {
+       __le64                  laddr_begin;
+       __le64                  laddr_end;
+       __le32                  oob_size;
+       __le32                  queue_size;
+       __le32                  gran_read;
+       __le32                  gran_write;
+       __le32                  gran_erase;
+       __le32                  t_r;
+       __le32                  t_sqr;
+       __le32                  t_w;
+       __le32                  t_sqw;
+       __le32                  t_e;
+       __le16                  chnl_parallelism;
+       __u8                    io_sched;
+       __u8                    reserved[133];
+} __attribute__((packed));
+
+struct nvme_lnvm_id {
+       __u8                            ver_id;
+       __u8                            nvm_type;
+       __le16                          nchannels;
+       __u8                            reserved[252];
+       struct nvme_lnvm_id_chnl        chnls[];
+} __attribute__((packed));
+
+#define NVME_LNVM_CHNLS_PR_REQ ((4096U - sizeof(struct nvme_lnvm_id)) \
+                                       / sizeof(struct nvme_lnvm_id_chnl))
+
 enum {
        NVME_CTRL_ONCS_COMPARE                  = 1 << 0,
        NVME_CTRL_ONCS_WRITE_UNCORRECTABLE      = 1 << 1,
@@ -123,7 +152,12 @@ struct nvme_id_ns {
 };
 
 enum {
+       NVME_CTRL_OACS_LIGHTNVM = 1 << 3,
+};
+
+enum {
        NVME_NS_FEAT_THIN       = 1 << 0,
+       NVME_NS_FEAT_LIGHTNVM   = 1 << 1,
        NVME_LBAF_RP_BEST       = 0,
        NVME_LBAF_RP_BETTER     = 1,
        NVME_LBAF_RP_GOOD       = 2,
@@ -192,6 +226,11 @@ enum nvme_opcode {
        nvme_cmd_dsm            = 0x09,
 };
 
+enum lnvme_opcode {
+       lnvme_cmd_erase_sync    = 0x80,
+       lnvme_cmd_erase_async   = 0x81,
+};
+
 struct nvme_common_command {
        __u8                    opcode;
        __u8                    flags;
@@ -222,6 +261,22 @@ struct nvme_rw_command {
        __le16                  appmask;
 };
 
+struct nvme_lnvm_rw_command {
+       __u8                    opcode;
+       __u8                    flags;
+       __u16                   command_id;
+       __le32                  nsid;
+       __u64                   rsvd2;
+       __le64                  metadata;
+       __le64                  prp1;
+       __le64                  prp2;
+       __le64                  slba;
+       __le16                  length;
+       __le16                  control;
+       __le32                  dsmgmt;
+       __le64                  phys_addr;
+};
+
 enum {
        NVME_RW_LR                      = 1 << 15,
        NVME_RW_FUA                     = 1 << 14,
@@ -285,6 +340,11 @@ enum nvme_admin_opcode {
        nvme_admin_format_nvm           = 0x80,
        nvme_admin_security_send        = 0x81,
        nvme_admin_security_recv        = 0x82,
+
+       lnvm_admin_identify             = 0xc0,
+       lnvm_admin_get_features         = 0xc1,
+       lnvm_admin_set_responsibility   = 0xc2,
+       lnvm_admin_get_l2p_tbl          = 0xc3,
 };
 
 enum {
@@ -410,6 +470,18 @@ struct nvme_format_cmd {
        __u32                   rsvd11[5];
 };
 
+struct nvme_lnvm_identify {
+       __u8                    opcode;
+       __u8                    flags;
+       __u16                   command_id;
+       __le32                  nsid;
+       __u64                   rsvd[2];
+       __le64                  prp1;
+       __le64                  prp2;
+       __le32                  cns;
+       __u32                   rsvd11[5];
+};
+
 struct nvme_command {
        union {
                struct nvme_common_command common;
@@ -423,6 +495,8 @@ struct nvme_command {
                struct nvme_format_cmd format;
                struct nvme_dsm_cmd dsm;
                struct nvme_abort_cmd abort;
+               struct nvme_lnvm_identify lnvm_identify;
+               struct nvme_lnvm_rw_command lnvm_rw;
        };
 };
 
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[RFC PATCH 4/5] nvme: LightNVM integration

Reply via email to