Signed-off-by: Ming Lin <min...@ssi.samsung.com>
---
 drivers/block/Kconfig            |   7 +
 drivers/block/Makefile           |   1 +
 drivers/block/nvme-core.c        |   1 +
 drivers/block/virtio_nvme.c      | 853 +++++++++++++++++++++++++++++++++++++++
 include/linux/virtio_nvme.h      |  53 +++
 include/uapi/linux/virtio_ids.h  |   1 +
 include/uapi/linux/virtio_nvme.h |  30 ++
 7 files changed, 946 insertions(+)
 create mode 100644 drivers/block/virtio_nvme.c
 create mode 100644 include/linux/virtio_nvme.h
 create mode 100644 include/uapi/linux/virtio_nvme.h

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1b8094d..7149885 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -519,6 +519,13 @@ config VIRTIO_BLK
          This is the virtual block driver for virtio.  It can be used with
           lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
 
+config VIRTIO_NVME
+       tristate "Virtio NVMe driver"
+       depends on VIRTIO
+       ---help---
+         This is the virtual NVMe driver for virtio.  It can be used with
+          lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
+
 config BLK_DEV_HD
        bool "Very old hard disk (MFM/RLL/IDE) driver"
        depends on HAVE_IDE
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 02b688d..3b73f59 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_BLK_DEV_UMEM)    += umem.o
 obj-$(CONFIG_BLK_DEV_NBD)      += nbd.o
 obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
 obj-$(CONFIG_VIRTIO_BLK)       += virtio_blk.o
+obj-$(CONFIG_VIRTIO_NVME)      += virtio_nvme.o
 
 obj-$(CONFIG_BLK_DEV_SX8)      += sx8.o
 obj-$(CONFIG_BLK_DEV_HD)       += hd.o
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 7920c27..7895606 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -1059,6 +1059,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 {
        return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
 }
+EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
 
 static int nvme_submit_async_admin_req(struct nvme_dev *dev)
 {
diff --git a/drivers/block/virtio_nvme.c b/drivers/block/virtio_nvme.c
new file mode 100644
index 0000000..57f81fc
--- /dev/null
+++ b/drivers/block/virtio_nvme.c
@@ -0,0 +1,853 @@
+/* Modified from virtio_blk.c and nvme-core.c */
+
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/delay.h>
+#include <linux/virtio.h>
+#include <linux/virtio_nvme.h>
+#include <linux/scatterlist.h>
+#include <linux/string_helpers.h>
+#include <linux/idr.h>
+#include <linux/blk-mq.h>
+#include <linux/numa.h>
+#include <linux/nvme.h>
+
+#define ADMIN_TIMEOUT           (2 * HZ)
+#define NVME_AQ_DEPTH          256
+
+static int virtnvme_major;
+module_param(virtnvme_major, int, 0);
+
+static unsigned int virtnvme_queue_depth;
+module_param_named(queue_depth, virtnvme_queue_depth, uint, 0444);
+
+static DEFINE_SPINLOCK(dev_list_lock);
+static LIST_HEAD(dev_list);
+
+static void virtnvme_free_namespaces(struct virtio_nvme_dev *dev);
+
+static const struct virtio_device_id id_table[] = {
+       { VIRTIO_ID_NVME, VIRTIO_DEV_ANY_ID },
+       { 0 },
+};
+
+struct virtnvme_req
+{
+       struct request *req;
+       struct nvme_command cmd;
+       struct virtio_nvme_resp resp;
+       struct scatterlist sg[];
+};
+
+static int virtnvme_identify_ctrl(struct virtio_nvme_dev *dev, struct nvme_id_ctrl **id)
+{
+       struct nvme_command c = { };
+       int error;
+
+       /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+       c.identify.opcode = nvme_admin_identify;
+       c.identify.cns = cpu_to_le32(1);
+
+       *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
+       if (!*id)
+               return -ENOMEM;
+
+       error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+                       sizeof(struct nvme_id_ctrl));
+       if (error)
+               kfree(*id);
+       return error;
+}
+
+static int virtnvme_identify_ns(struct virtio_nvme_dev *dev, unsigned nsid,
+               struct nvme_id_ns **id)
+{
+       struct nvme_command c = { };
+       int error;
+
+       /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+       c.identify.opcode = nvme_admin_identify;
+       c.identify.nsid = cpu_to_le32(nsid);
+
+       *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
+       if (!*id)
+               return -ENOMEM;
+
+       error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+                       sizeof(struct nvme_id_ns));
+       if (error)
+               kfree(*id);
+       return error;
+}
+
+static int virtnvme_wait_ready(struct virtio_nvme_dev *dev, u64 cap)
+{
+       struct virtio_device *vdev = dev->vdev;
+       unsigned long timeout;
+       u32 csts;
+
+       timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
+
+       while (1) {
+               virtio_cread(vdev, struct virtio_nvme_config, csts, &csts);
+               if ((csts & NVME_CSTS_RDY) == NVME_CSTS_RDY)
+                       break;
+
+               msleep(100);
+               if (fatal_signal_pending(current))
+                       return -EINTR;
+               if (time_after(jiffies, timeout)) {
+                       printk(KERN_ERR "Device not ready; aborting initialisation\n");
+                       return -ENODEV;
+               }
+       }
+
+       return 0;
+}
+
+static void virtnvme_admin_done(struct virtqueue *vq)
+{
+       struct virtio_nvme_dev *dev = vq->vdev->priv;
+       struct virtnvme_req *vnr;
+       int qid = vq->index;
+       unsigned long flags;
+       unsigned int len;
+
+       spin_lock_irqsave(&dev->vqs[qid].lock, flags);
+       do {
+               virtqueue_disable_cb(vq);
+               while ((vnr = virtqueue_get_buf(dev->vqs[qid].vq, &len)) != NULL)
+                       blk_mq_complete_request(vnr->req);
+               if (unlikely(virtqueue_is_broken(vq)))
+                       break;
+       } while (!virtqueue_enable_cb(vq));
+
+       spin_unlock_irqrestore(&dev->vqs[qid].lock, flags);
+}
+
+static void virtnvme_io_done(struct virtqueue *vq)
+{
+       struct virtio_nvme_dev *dev = vq->vdev->priv;
+       int qid = vq->index;
+       struct virtnvme_req *vnr;
+       unsigned long flags;
+       unsigned int len;
+       bool bio_done = false;
+
+       spin_lock_irqsave(&dev->vqs[qid].lock, flags);
+       do {
+               virtqueue_disable_cb(vq);
+               while ((vnr = virtqueue_get_buf(dev->vqs[qid].vq, &len)) != NULL) {
+                       blk_mq_complete_request(vnr->req);
+                       bio_done = true;
+               }
+
+               if (unlikely(virtqueue_is_broken(vq)))
+                       break;
+       } while (!virtqueue_enable_cb(vq));
+
+       spin_unlock_irqrestore(&dev->vqs[qid].lock, flags);
+
+       if (bio_done)
+               wake_up(&dev->queue_wait);
+}
+
+static int virtnvme_init_vq(struct virtio_nvme_dev *dev)
+{
+       int err = 0;
+       int i;
+       vq_callback_t **callbacks;
+       const char **names;
+       struct virtqueue **vqs;
+       unsigned num_vqs;
+       struct virtio_device *vdev = dev->vdev;
+
+       err = virtio_cread_feature(vdev, VIRTIO_NVME_F_MQ,
+                                  struct virtio_nvme_config, num_queues,
+                                  &num_vqs);
+       if (err)
+               num_vqs = 1;
+
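+       /* Reserve one extra virtqueue: vq 0 is the admin queue, vqs 1..N carry I/O */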
+       num_vqs++;
+
+       dev->vqs = kmalloc(sizeof(*dev->vqs) * num_vqs, GFP_KERNEL);
+       if (!dev->vqs) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       names = kmalloc(sizeof(*names) * num_vqs, GFP_KERNEL);
+       if (!names) {
+               err = -ENOMEM;
+               goto err_names;
+       }
+
+       callbacks = kmalloc(sizeof(*callbacks) * num_vqs, GFP_KERNEL);
+       if (!callbacks) {
+               err = -ENOMEM;
+               goto err_callbacks;
+       }
+
+       vqs = kmalloc(sizeof(*vqs) * num_vqs, GFP_KERNEL);
+       if (!vqs) {
+               err = -ENOMEM;
+               goto err_vqs;
+       }
+
+       callbacks[0] = virtnvme_admin_done;
+       names[0] = "admin";
+       dev->vqs[0].dev = dev;
+
+       for (i = 1; i < num_vqs; i++) {
+               callbacks[i] = virtnvme_io_done;
+               snprintf(dev->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
+               names[i] = dev->vqs[i].name;
+               dev->vqs[i].dev = dev;
+       }
+
+       /* Discover virtqueues and write information to configuration.  */
+       err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
+       if (err)
+               goto err_find_vqs;
+
+       for (i = 0; i < num_vqs; i++) {
+               spin_lock_init(&dev->vqs[i].lock);
+               dev->vqs[i].vq = vqs[i];
+       }
+       dev->num_vqs = num_vqs;
+
+err_find_vqs:
+       kfree(vqs);
+err_vqs:
+       kfree(callbacks);
+err_callbacks:
+       kfree(names);
+err_names:
+       if (err)
+               kfree(dev->vqs);
+out:
+       return err;
+}
+
+static inline struct virtnvme_req *virtnvme_alloc_req(struct virtio_nvme_dev *dev,
+               gfp_t gfp_mask)
+{
+       struct virtnvme_req *vnr;
+
+       vnr = kmalloc(sizeof(*vnr) + dev->sg_elems*sizeof(struct scatterlist),
+                       gfp_mask);
+       if (!vnr)
+               return NULL;
+
+       sg_init_table(vnr->sg, dev->sg_elems);
+
+       return vnr;
+}
+
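+/* Convert a 512-byte kernel sector number to a namespace LBA */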
+static inline u64 virtnvme_block_nr(struct virtio_nvme_ns *ns, sector_t sector)
+{
+        return (sector >> (ns->lba_shift - 9));
+}
+
+static int virtnvme_add_req(struct virtio_nvme_ns *ns, struct virtqueue *vq,
+                            struct virtnvme_req *vnr,
+                            struct scatterlist *data_sg,
+                            bool have_data)
+{
+       struct scatterlist cmd, resp, *sgs[5];
+       unsigned int num_out = 0, num_in = 0;
+
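+       /* Descriptor layout: NVMe command (out), optional data (out or in by direction), response (in) */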
+       sg_init_one(&cmd, vnr->req->cmd, sizeof(struct nvme_command));
+       sgs[num_out++] = &cmd;
+
+       if (have_data) {
+               if (rq_data_dir(vnr->req))
+                       sgs[num_out++] = data_sg;
+               else
+                       sgs[num_out + num_in++] = data_sg;
+       }
+
+       sg_init_one(&resp, &vnr->resp, sizeof(struct virtio_nvme_resp));
+       sgs[num_out + num_in++] = &resp;
+
+       return virtqueue_add_sgs(vq, sgs, num_out, num_in, vnr, GFP_ATOMIC);
+}
+
+static int virtnvme_setup_io(struct virtnvme_req *vnr, struct virtio_nvme_ns *ns)
+{
+       struct nvme_command *cmnd;
+       struct request *req = vnr->req;
+       u16 control = 0;
+       u32 dsmgmt = 0;
+
+#if 0 /* TODO */
+       if (req->cmd_flags & REQ_FUA)
+               control |= NVME_RW_FUA;
+       if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
+               control |= NVME_RW_LR;
+
+       if (req->cmd_flags & REQ_RAHEAD)
+               dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+#endif
+
+       cmnd = &vnr->cmd;
+       req->cmd = (unsigned char *)cmnd;
+       req->cmd_len = sizeof(struct nvme_command);
+       memset(cmnd, 0, sizeof(*cmnd));
+
+       cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+       cmnd->rw.command_id = req->tag;
+       cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
+       cmnd->rw.slba = cpu_to_le64(virtnvme_block_nr(ns, blk_rq_pos(req)));
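+       /* The NVMe block count is zero-based, hence the "- 1" */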
+       cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+       cmnd->rw.control = cpu_to_le16(control);
+       cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
+
+       return 0;
+}
+
+static int virtnvme_queue_rq(struct blk_mq_hw_ctx *hctx,
+               const struct blk_mq_queue_data *bd)
+{
+       struct virtio_nvme_ns *ns = hctx->queue->queuedata;
+       struct virtio_nvme_queue *nvmeq = hctx->driver_data;
+       struct request *req = bd->rq;
+       struct virtnvme_req *vnr = blk_mq_rq_to_pdu(req);
+       unsigned long flags;
+       unsigned int num;
+       int err;
+       bool notify = false;
+
+       vnr->req = req;
+
+       if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+               ; /* TODO: nvme_submit_priv(nvmeq, req, iod) */
+       else if (req->cmd_flags & REQ_DISCARD)
+               ; /* TODO: nvme_submit_discard(nvmeq, ns, req, iod) */
+       else if (req->cmd_flags & REQ_FLUSH)
+               ; /* TODO: nvme_submit_flush(nvmeq, ns, req->tag) */
+       else
+               virtnvme_setup_io(vnr, ns);
+
+       blk_mq_start_request(req);
+
+       num = blk_rq_map_sg(hctx->queue, vnr->req, vnr->sg);
+
+       spin_lock_irqsave(&nvmeq->lock, flags);
+       err = virtnvme_add_req(ns, nvmeq->vq, vnr, vnr->sg, num);
+       if (err) {
+               virtqueue_kick(nvmeq->vq);
+               blk_mq_stop_hw_queue(hctx);
+               spin_unlock_irqrestore(&nvmeq->lock, flags);
+               if (err == -ENOMEM || err == -ENOSPC)
+                       return BLK_MQ_RQ_QUEUE_BUSY;
+               return BLK_MQ_RQ_QUEUE_ERROR;
+       }
+
+       if (bd->last && virtqueue_kick_prepare(nvmeq->vq))
+               notify = true;
+       spin_unlock_irqrestore(&nvmeq->lock, flags);
+
+       if (notify)
+               virtqueue_notify(nvmeq->vq);
+       return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static inline void virtnvme_request_done(struct request *req)
+{
+       struct virtnvme_req *vnr = blk_mq_rq_to_pdu(req);
+       int error = vnr->resp.status;
+
+#if 0 /* TODO */
+       if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
+               req->resid_len = virtio32_to_cpu(dev->vdev, vbr->in_hdr.residual);
+               req->sense_len = virtio32_to_cpu(dev->vdev, vbr->in_hdr.sense_len);
+               req->errors = virtio32_to_cpu(dev->vdev, vbr->in_hdr.errors);
+       } else if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
+               req->errors = (error != 0);
+       }
+#endif
+
+       blk_mq_end_request(req, error);
+}
+
+static int virtnvme_init_request(void *data, struct request *rq,
+               unsigned int hctx_idx, unsigned int request_idx,
+               unsigned int numa_node)
+{
+       struct virtio_nvme_dev *dev = data;
+       struct virtnvme_req *vnr = blk_mq_rq_to_pdu(rq);
+
+       sg_init_table(vnr->sg, dev->sg_elems);
+       return 0;
+}
+
+static int virtnvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+                               unsigned int hctx_idx)
+{
+       struct virtio_nvme_dev *dev = data;
+       struct virtio_nvme_queue *nvmeq = &dev->vqs[0];
+
+       hctx->driver_data = nvmeq;
+       return 0;
+}
+
+static int virtnvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+                               unsigned int hctx_idx)
+{
+       struct virtio_nvme_dev *dev = data;
+       struct virtio_nvme_queue *nvmeq = &dev->vqs[hctx_idx+1];
+
+       hctx->driver_data = nvmeq;
+       return 0;
+}
+
+static struct blk_mq_ops virtio_nvme_mq_admin_ops = {
+       .queue_rq       = virtnvme_queue_rq,
+       .map_queue      = blk_mq_map_queue,
+       .init_hctx      = virtnvme_admin_init_hctx,
+       .complete       = virtnvme_request_done,
+       .init_request   = virtnvme_init_request,
+};
+
+static struct blk_mq_ops virtio_nvme_mq_ops = {
+       .queue_rq       = virtnvme_queue_rq,
+       .map_queue      = blk_mq_map_queue,
+       .init_hctx      = virtnvme_init_hctx,
+       .complete       = virtnvme_request_done,
+       .init_request   = virtnvme_init_request,
+};
+
+static int virtnvme_open(struct block_device *bdev, fmode_t mode)
+{
+       struct virtio_nvme_ns *ns = bdev->bd_disk->private_data;
+       struct virtio_nvme_dev *dev = ns->dev;
+
+       kref_get(&dev->kref);
+       return 0;
+}
+
+static DEFINE_IDA(nvme_instance_ida);
+
+static int nvme_set_instance(struct virtio_nvme_dev *dev)
+{
+       int instance, error;
+
+       do {
+               if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
+                       return -ENODEV;
+
+               spin_lock(&dev_list_lock);
+               error = ida_get_new(&nvme_instance_ida, &instance);
+               spin_unlock(&dev_list_lock);
+       } while (error == -EAGAIN);
+
+       if (error)
+               return -ENODEV;
+
+       dev->instance = instance;
+       return 0;
+}
+
+static void virtnvme_release_instance(struct virtio_nvme_dev *dev)
+{
+       spin_lock(&dev_list_lock);
+       ida_remove(&nvme_instance_ida, dev->instance);
+       spin_unlock(&dev_list_lock);
+}
+
+static void virtnvme_free_dev(struct kref *kref)
+{
+        struct virtio_nvme_dev *dev = container_of(kref,
+                                       struct virtio_nvme_dev, kref);
+
+        virtnvme_free_namespaces(dev);
+        virtnvme_release_instance(dev);
+       if (dev->tagset.tags)
+               blk_mq_free_tag_set(&dev->tagset);
+       if (dev->admin_q)
+               blk_put_queue(dev->admin_q);
+        kfree(dev);
+}
+
+static void virtnvme_release(struct gendisk *disk, fmode_t mode)
+{
+       struct virtio_nvme_ns *ns = disk->private_data;
+       struct virtio_nvme_dev *dev = ns->dev;
+
+       kref_put(&dev->kref, virtnvme_free_dev);
+}
+
+static const struct block_device_operations virtnvme_fops = {
+       .owner          = THIS_MODULE,
+       .open           = virtnvme_open,
+       .release        = virtnvme_release,
+};
+
+static struct virtio_nvme_ns *virtnvme_alloc_ns(struct virtio_nvme_dev *dev, unsigned nsid,
+                       struct nvme_id_ns *id)
+{
+       struct virtio_nvme_ns *ns;
+       struct gendisk *disk;
+       int lbaf;
+
+       ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+       if (!ns)
+               return NULL;
+       ns->queue = blk_mq_init_queue(&dev->tagset);
+       if (!ns->queue)
+               goto out_free_ns;
+       ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
+       queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
+       queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
+       queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, ns->queue);
+       ns->dev = dev;
+       ns->queue->queuedata = ns;
+
+       disk = alloc_disk(0);
+       if (!disk)
+               goto out_free_queue;
+       ns->ns_id = nsid;
+       ns->disk = disk;
+       lbaf = id->flbas & 0xf;
+       ns->lba_shift = id->lbaf[lbaf].ds;
+       ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+       blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
+       if (dev->max_hw_sectors)
+               blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
+       disk->major = virtnvme_major;
+       disk->first_minor = 0;
+       disk->fops = &virtnvme_fops;
+       disk->private_data = ns;
+       disk->queue = ns->queue;
+       disk->flags = GENHD_FL_EXT_DEVT;
+       sprintf(disk->disk_name, "vnvme%dn%d", dev->instance, nsid);
+       set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
+
+       return ns;
+
+out_free_queue:
+       blk_cleanup_queue(ns->queue);
+out_free_ns:
+       kfree(ns);
+       return NULL;
+}
+
+static unsigned int virtnvme_cmd_size(struct virtio_nvme_dev *dev)
+{
+       unsigned int ret;
+
+       ret = sizeof(struct virtnvme_req) +
+               sizeof(struct scatterlist) * dev->sg_elems;
+
+        return ret;
+}
+
+static int virtnvme_dev_add(struct virtio_nvme_dev *dev)
+{
+       int res;
+       unsigned nn, i;
+       struct virtio_nvme_ns *ns;
+       struct nvme_id_ctrl *ctrl;
+       struct nvme_id_ns *id_ns;
+       int err;
+
+       res = virtnvme_identify_ctrl(dev, &ctrl);
+       if (res) {
+               printk(KERN_ERR "Identify Controller failed (%d)\n", res);
+               res = -EIO;
+               goto out;
+       }
+
+       nn = le32_to_cpup(&ctrl->nn);
+       kfree(ctrl);
+
+       memset(&dev->tagset, 0, sizeof(dev->tagset));
+       dev->tagset.ops = &virtio_nvme_mq_ops;
+       /* Default queue sizing is to fill the ring. */
+       if (!virtnvme_queue_depth)
+               virtnvme_queue_depth = dev->vqs[1].vq->num_free;
+       dev->tagset.queue_depth = virtnvme_queue_depth;
+       dev->tagset.numa_node = NUMA_NO_NODE;
+       dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
+       dev->tagset.cmd_size = virtnvme_cmd_size(dev);
+       dev->tagset.driver_data = dev;
+       dev->tagset.nr_hw_queues = dev->num_vqs - 1;
+
+       err = blk_mq_alloc_tag_set(&dev->tagset);
+       if (err)
+               goto out;
+
+       for (i = 1; i <= nn; i++) {
+               res = virtnvme_identify_ns(dev, i, &id_ns);
+               if (res)
+                       continue;
+
+               if (id_ns->ncap == 0) {
+                       kfree(id_ns);
+                       continue;
+               }
+
+               ns = virtnvme_alloc_ns(dev, i, id_ns);
+               if (ns)
+                       list_add_tail(&ns->list, &dev->namespaces);
+               kfree(id_ns);
+       }
+       list_for_each_entry(ns, &dev->namespaces, list)
+               add_disk(ns->disk);
+
+out:
+       return res;
+}
+
+static void virtnvme_dev_remove_admin(struct virtio_nvme_dev *dev)
+{
+       if (dev->admin_q && !blk_queue_dying(dev->admin_q)) {
+               blk_cleanup_queue(dev->admin_q);
+               blk_mq_free_tag_set(&dev->admin_tagset);
+       }
+}
+
+static int virtnvme_alloc_admin_tags(struct virtio_nvme_dev *dev)
+{
+       if (!dev->admin_q) {
+               dev->admin_tagset.ops = &virtio_nvme_mq_admin_ops;
+               dev->admin_tagset.nr_hw_queues = 1;
+               dev->admin_tagset.queue_depth = NVME_AQ_DEPTH;
+               dev->admin_tagset.reserved_tags = 1;
+               dev->admin_tagset.timeout = ADMIN_TIMEOUT;
+               dev->admin_tagset.numa_node = NUMA_NO_NODE;
+               dev->admin_tagset.cmd_size = virtnvme_cmd_size(dev);
+               dev->admin_tagset.driver_data = dev;
+
+               if (blk_mq_alloc_tag_set(&dev->admin_tagset))
+                       return -ENOMEM;
+
+               dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
+               if (IS_ERR(dev->admin_q)) {
+                       blk_mq_free_tag_set(&dev->admin_tagset);
+                       return -ENOMEM;
+               }
+               if (!blk_get_queue(dev->admin_q)) {
+                       virtnvme_dev_remove_admin(dev);
+                       dev->admin_q = NULL;
+                       return -ENODEV;
+               }
+       } else
+               blk_mq_unfreeze_queue(dev->admin_q);
+
+       return 0;
+}
+
+static int virtnvme_probe(struct virtio_device *vdev)
+{
+       struct virtio_nvme_dev *dev;
+       u64 cap;
+       u32 ctrl_config;
+       u32 sg_elems;
+       int err;
+
+       if (!vdev->config->get) {
+               printk(KERN_ERR "%s failure: config access disabled\n", __func__);
+               return -EINVAL;
+       }
+
+       vdev->priv = dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+       if (!dev)
+               return -ENOMEM;
+       INIT_LIST_HEAD(&dev->namespaces);
+       kref_init(&dev->kref);
+
+       init_waitqueue_head(&dev->queue_wait);
+       dev->vdev = vdev;
+
+       err = nvme_set_instance(dev);
+       if (err)
+               goto out_free_dev;
+
+       /* We need to know how many segments before we allocate. */
+       err = virtio_cread_feature(vdev, VIRTIO_NVME_F_SEG_MAX,
+                                  struct virtio_nvme_config, seg_max,
+                                  &sg_elems);
+       /* We need at least one SG element, whatever they say. */
+       if (err || !sg_elems)
+               sg_elems = 1;
+
+       /* We need two extra sg elements at head for command and response */
+       sg_elems += 2;
+       dev->sg_elems = sg_elems;
+
+       /*
+        * 1. The host determines the controller capabilities
+        */
+       virtio_cread(vdev, struct virtio_nvme_config, cap, &cap);
+
+       /*
+        * 2. The host configures controller settings. Specific settings include:
+        *      a. The arbitration mechanism should be selected in CC.AMS.
+        *      b. The memory page size should be initialized in CC.MPS.
+        *      c. The I/O Command Set that is to be used should be selected in CC.CSS.
+        * 3. The controller should be enabled by setting CC.EN to 1
+        */
+       ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
+       ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
+       ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
+       ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
+       virtio_cwrite(vdev, struct virtio_nvme_config, ctrl_config, &ctrl_config);
+
+       /*
+        * 4. The host should wait for the controller to indicate it is ready to
+        *    process commands. The controller is ready to process commands when
+        *    CSTS.RDY is set to 1.
+        */
+       err = virtnvme_wait_ready(dev, cap);
+       if (err)
+               goto release;
+
+       /* Qemu starts controller and creates VQs */
+       err = virtnvme_init_vq(dev);
+       if (err)
+               goto release;
+
+       err = virtnvme_alloc_admin_tags(dev);
+       if (err)
+               goto release;
+
+       spin_lock(&dev_list_lock);
+       list_add(&dev->node, &dev_list);
+       spin_unlock(&dev_list_lock);
+
+       /*
+        * 6. The host should determine the configuration of the controller by
+        *    issuing the Identify command, specifying the Controller data
+        *    structure. The host should then determine the configuration of
+        *    each namespace by issuing the Identify command for each namespace,
+        *    specifying the Namespace data structure
+        */
+       err = virtnvme_dev_add(dev);
+       if (err)
+               goto out_free_vq;
+
+       return 0;
+
+out_free_vq:
+       vdev->config->del_vqs(vdev);
+
+release:
+       virtnvme_release_instance(dev);
+
+out_free_dev:
+       kfree(dev);
+       return err;
+}
+
+static void virtnvme_ns_remove(struct virtio_nvme_ns *ns)
+{
+       bool kill = !blk_queue_dying(ns->queue);
+
+       if (kill)
+               blk_set_queue_dying(ns->queue);
+       if (ns->disk->flags & GENHD_FL_UP) {
+               if (blk_get_integrity(ns->disk))
+                       blk_integrity_unregister(ns->disk);
+               del_gendisk(ns->disk);
+       }
+       if (kill || !blk_queue_dying(ns->queue)) {
+               blk_mq_abort_requeue_list(ns->queue);
+               blk_cleanup_queue(ns->queue);
+        }
+}
+
+static void virtnvme_dev_remove(struct virtio_nvme_dev *dev)
+{
+       struct virtio_nvme_ns *ns;
+
+       list_for_each_entry(ns, &dev->namespaces, list)
+               virtnvme_ns_remove(ns);
+}
+
+static void virtnvme_free_namespace(struct virtio_nvme_ns *ns)
+{
+       list_del(&ns->list);
+
+       spin_lock(&dev_list_lock);
+       ns->disk->private_data = NULL;
+       spin_unlock(&dev_list_lock);
+
+       put_disk(ns->disk);
+       kfree(ns);
+}
+
+static void virtnvme_free_namespaces(struct virtio_nvme_dev *dev)
+{
+       struct virtio_nvme_ns *ns, *next;
+
+       list_for_each_entry_safe(ns, next, &dev->namespaces, list)
+               virtnvme_free_namespace(ns);
+}
+
+static void virtnvme_remove(struct virtio_device *vdev)
+{
+       struct virtio_nvme_dev *dev = vdev->priv;
+
+       spin_lock(&dev_list_lock);
+       list_del_init(&dev->node);
+       spin_unlock(&dev_list_lock);
+
+       /* Stop all the virtqueues. */
+       vdev->config->reset(vdev);
+
+       vdev->config->del_vqs(vdev);
+
+       virtnvme_dev_remove(dev);
+       virtnvme_dev_remove_admin(dev);
+
+       blk_mq_free_tag_set(&dev->tagset);
+       kfree(dev->vqs);
+
+       kref_put(&dev->kref, virtnvme_free_dev);
+}
+
+static unsigned int features[] = {
+       VIRTIO_NVME_F_SEG_MAX, VIRTIO_NVME_F_MQ,
+};
+
+static struct virtio_driver virtio_nvme_driver = {
+       .feature_table                  = features,
+       .feature_table_size             = ARRAY_SIZE(features),
+       .driver.name                    = KBUILD_MODNAME,
+       .driver.owner                   = THIS_MODULE,
+       .id_table                       = id_table,
+       .probe                          = virtnvme_probe,
+       .remove                         = virtnvme_remove,
+};
+
+static int __init virtnvme_init(void)
+{
+       int error;
+
+       virtnvme_major = register_blkdev(0, "virtnvme");
+       if (virtnvme_major < 0) {
+               error = virtnvme_major;
+               goto out;
+       }
+
+       error = register_virtio_driver(&virtio_nvme_driver);
+       if (error)
+               goto out_unregister_blkdev;
+       return 0;
+
+out_unregister_blkdev:
+       unregister_blkdev(virtnvme_major, "virtnvme");
+out:
+       return error;
+}
+
+static void __exit virtnvme_exit(void)
+{
+       unregister_virtio_driver(&virtio_nvme_driver);
+       unregister_blkdev(virtnvme_major, "virtnvme");
+}
+module_init(virtnvme_init);
+module_exit(virtnvme_exit);
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio NVMe driver");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ming Lin <min...@ssi.samsung.com>");
diff --git a/include/linux/virtio_nvme.h b/include/linux/virtio_nvme.h
new file mode 100644
index 0000000..c8db9a2
--- /dev/null
+++ b/include/linux/virtio_nvme.h
@@ -0,0 +1,53 @@
+#ifndef _LINUX_VIRTIO_NVME_H
+#define _LINUX_VIRTIO_NVME_H
+
+#include <uapi/linux/virtio_nvme.h>
+#include <linux/blk-mq.h>
+
+#define VQ_NAME_LEN 16
+
+struct virtio_nvme_dev;
+struct virtio_nvme_queue {
+       struct virtio_nvme_dev *dev;
+       struct virtqueue *vq;
+       spinlock_t lock;
+       char name[VQ_NAME_LEN];
+} ____cacheline_aligned_in_smp;
+
+struct virtio_nvme_dev {
+       struct virtio_device *vdev;
+       wait_queue_head_t queue_wait;
+       struct request_queue *admin_q;
+       struct blk_mq_tag_set admin_tagset;
+       struct blk_mq_tag_set tagset;
+
+       /* num of vqs */
+       int num_vqs;
+       struct virtio_nvme_queue *vqs;
+       struct list_head node;
+       int instance;
+       u32 ctrl_config;
+       struct list_head namespaces;
+       struct kref kref;
+       char name[12];
+       char serial[20];
+       char model[40];
+       char firmware_rev[8];
+       u32 max_hw_sectors;
+
+       unsigned int sg_elems;
+};
+
+struct virtio_nvme_ns {
+       struct list_head list;
+
+       struct virtio_nvme_dev *dev;
+       struct request_queue *queue;
+       struct gendisk *disk;
+
+       unsigned ns_id;
+       int lba_shift;
+       int ms;
+};
+
+#endif
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index 77925f5..d59d323 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -41,5 +41,6 @@
 #define VIRTIO_ID_CAIF        12 /* Virtio caif */
 #define VIRTIO_ID_GPU          16 /* virtio GPU */
 #define VIRTIO_ID_INPUT        18 /* virtio input */
+#define VIRTIO_ID_NVME         19 /* TBD: virtio NVMe, need Red Hat's help to get this id */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/include/uapi/linux/virtio_nvme.h b/include/uapi/linux/virtio_nvme.h
new file mode 100644
index 0000000..33f6077
--- /dev/null
+++ b/include/uapi/linux/virtio_nvme.h
@@ -0,0 +1,30 @@
+#ifndef _UAPI_LINUX_VIRTIO_NVME_H
+#define _UAPI_LINUX_VIRTIO_NVME_H
+
+#include <linux/types.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_types.h>
+
+/* Feature bits */
+#define VIRTIO_NVME_F_SEG_MAX  1       /* Indicates maximum # of segments */
+#define VIRTIO_NVME_F_MQ       2       /* support more than one vq */
+
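+/* Device configuration space; cap, ctrl_config and csts mirror the NVMe CAP, CC and CSTS registers */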
+struct virtio_nvme_config {
+       __u64   cap;
+       __u32   ctrl_config;
+       __u32   csts;
+
+       /* The maximum number of segments (if VIRTIO_NVME_F_SEG_MAX) */
+       __u32   seg_max;
+       /* number of vqs, only available when VIRTIO_NVME_F_MQ is set */
+       __u32 num_queues;
+} __attribute__((packed));
+
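+/* Per-request response; result, cid and status correspond to the NVMe completion entry fields */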
+struct virtio_nvme_resp {
+    __u32      result;
+    __u16      cid;
+    __u16      status;
+};
+
+#endif
-- 
1.9.1
