Re: [Qemu-block] [PATCH v6 3/9] block: Add VFIO based NVMe driver

2018-02-13 Thread Eric Blake

On 01/16/2018 12:08 AM, Fam Zheng wrote:

This is a new protocol driver that exclusively opens a host NVMe
controller through VFIO. It achieves better latency than linux-aio by
completely bypassing host kernel vfs/block layer.

 $rw-$bs-$iodepth  linux-aio nvme://
 
 randread-4k-1     10.5k  21.6k
 randread-512k-1   745    1591
 randwrite-4k-1    30.7k  37.0k
 randwrite-512k-1  1945   1980

 (unit: IOPS)

The driver also integrates with the polling mechanism of iothread.

This patch is co-authored by Paolo and me.

Signed-off-by: Paolo Bonzini 
Signed-off-by: Fam Zheng 
Message-Id: <20180110091846.10699-4-f...@redhat.com>
---


Sorry for not noticing sooner, but


+static int64_t coroutine_fn nvme_co_get_block_status(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum,
+ BlockDriverState **file)
+{
+*pnum = nb_sectors;
+*file = bs;
+
+return BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_OFFSET_VALID |
+   (sector_num << BDRV_SECTOR_BITS);


This is wrong.  Drivers should only ever return BDRV_BLOCK_DATA (which 
io.c then _adds_ BDRV_BLOCK_ALLOCATED to, as needed).  I'll fix it up as 
part of my byte-based block status series (v8 coming up soon).


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



[Qemu-block] [PATCH v6 3/9] block: Add VFIO based NVMe driver

2018-01-15 Thread Fam Zheng
This is a new protocol driver that exclusively opens a host NVMe
controller through VFIO. It achieves better latency than linux-aio by
completely bypassing host kernel vfs/block layer.

$rw-$bs-$iodepth  linux-aio nvme://

randread-4k-1     10.5k  21.6k
randread-512k-1   745    1591
randwrite-4k-1    30.7k  37.0k
randwrite-512k-1  1945   1980

(unit: IOPS)

The driver also integrates with the polling mechanism of iothread.

This patch is co-authored by Paolo and me.

Signed-off-by: Paolo Bonzini 
Signed-off-by: Fam Zheng 
Message-Id: <20180110091846.10699-4-f...@redhat.com>
---
 MAINTAINERS |6 +
 block/Makefile.objs |1 +
 block/nvme.c| 1180 +++
 block/trace-events  |   21 +
 4 files changed, 1208 insertions(+)
 create mode 100644 block/nvme.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 4770f105d4..bd636a4bff 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1876,6 +1876,12 @@ L: qemu-block@nongnu.org
 S: Supported
 F: block/null.c
 
+NVMe Block Driver
+M: Fam Zheng 
+L: qemu-block@nongnu.org
+S: Supported
+F: block/nvme*
+
 Bootdevice
 M: Gonglei 
 S: Maintained
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 6eaf78a046..4c7e9d84a7 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -11,6 +11,7 @@ block-obj-$(CONFIG_POSIX) += file-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 block-obj-y += null.o mirror.o commit.o io.o
 block-obj-y += throttle-groups.o
+block-obj-$(CONFIG_LINUX) += nvme.o
 
 block-obj-y += nbd.o nbd-client.o sheepdog.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
diff --git a/block/nvme.c b/block/nvme.c
new file mode 100644
index 0000000000..99cc7702ad
--- /dev/null
+++ b/block/nvme.c
@@ -0,0 +1,1180 @@
+/*
+ * NVMe block driver based on vfio
+ *
+ * Copyright 2016 - 2018 Red Hat, Inc.
+ *
+ * Authors:
+ *   Fam Zheng 
+ *   Paolo Bonzini 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include 
+#include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"
+#include "qemu/error-report.h"
+#include "qemu/cutils.h"
+#include "qemu/vfio-helpers.h"
+#include "block/block_int.h"
+#include "trace.h"
+
+/* TODO: Move nvme spec definitions from hw/block/nvme.h into a separate file
+ * that doesn't depend on dma/pci headers. */
+#include "sysemu/dma.h"
+#include "hw/pci/pci.h"
+#include "hw/block/block.h"
+#include "hw/block/nvme.h"
+
+#define NVME_SQ_ENTRY_BYTES 64
+#define NVME_CQ_ENTRY_BYTES 16
+#define NVME_QUEUE_SIZE 128
+#define NVME_BAR_SIZE 8192
+
+typedef struct {
+int32_t  head, tail;
+uint8_t  *queue;
+uint64_t iova;
+/* Hardware MMIO register */
+volatile uint32_t *doorbell;
+} NVMeQueue;
+
+typedef struct {
+BlockCompletionFunc *cb;
+void *opaque;
+int cid;
+void *prp_list_page;
+uint64_t prp_list_iova;
+bool busy;
+} NVMeRequest;
+
+typedef struct {
+CoQueue free_req_queue;
+QemuMutex   lock;
+
+/* Fields protected by BQL */
+int index;
+uint8_t *prp_list_pages;
+
+/* Fields protected by @lock */
+NVMeQueue   sq, cq;
+int cq_phase;
+NVMeRequest reqs[NVME_QUEUE_SIZE];
+boolbusy;
+int need_kick;
+int inflight;
+} NVMeQueuePair;
+
+/* Memory mapped registers */
+typedef volatile struct {
+uint64_t cap;
+uint32_t vs;
+uint32_t intms;
+uint32_t intmc;
+uint32_t cc;
+uint32_t reserved0;
+uint32_t csts;
+uint32_t nssr;
+uint32_t aqa;
+uint64_t asq;
+uint64_t acq;
+uint32_t cmbloc;
+uint32_t cmbsz;
+uint8_t  reserved1[0xec0];
+uint8_t  cmd_set_specfic[0x100];
+uint32_t doorbells[];
+} QEMU_PACKED NVMeRegs;
+
+QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);
+
+typedef struct {
+AioContext *aio_context;
+QEMUVFIOState *vfio;
+NVMeRegs *regs;
+/* The submission/completion queue pairs.
+ * [0]: admin queue.
+ * [1..]: io queues.
+ */
+NVMeQueuePair **queues;
+int nr_queues;
+size_t page_size;
+/* How many uint32_t elements does each doorbell entry take. */
+size_t doorbell_scale;
+bool write_cache_supported;
+EventNotifier irq_notifier;
+uint64_t nsze; /* Namespace size reported by identify command */
+int nsid;  /* The namespace id to read/write data. */
+uint64_t max_transfer;
+int plugged;
+
+CoMutex dma_map_lock;
+CoQueue dma_flush_queue;
+
+/* Total size of mapped qiov, accessed under dma_map_lock */
+int dma_map_count;
+} BDRVNVMeState;
+
+#define NVME_BLOCK_OPT_DEVICE "device"
+#define NVME_BLOCK_OPT_NAMESPACE "namespace"
+
+static