date:20201208

Re: [PATCH v11 08/13] block/nvme: Make ZNS-related definitions

2020-12-08 Thread Klaus Jensen

CC for Stefan (nvme block driver co-maintainer).

On Dec  9 05:04, Dmitry Fomichev wrote:
> Define values and structures that are needed to support Zoned
> Namespace Command Set (NVMe TP 4053).
> 
> Signed-off-by: Dmitry Fomichev 
> ---
>  include/block/nvme.h | 114 ++-
>  1 file changed, 113 insertions(+), 1 deletion(-)
> 
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 29d826ab19..a9165402d6 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -489,6 +489,9 @@ enum NvmeIoCommands {
>  NVME_CMD_COMPARE= 0x05,
>  NVME_CMD_WRITE_ZEROES   = 0x08,
>  NVME_CMD_DSM= 0x09,
> +NVME_CMD_ZONE_MGMT_SEND = 0x79,
> +NVME_CMD_ZONE_MGMT_RECV = 0x7a,
> +NVME_CMD_ZONE_APPEND= 0x7d,
>  };
>  
>  typedef struct QEMU_PACKED NvmeDeleteQ {
> @@ -648,9 +651,13 @@ typedef struct QEMU_PACKED NvmeAerResult {
>  uint8_t resv;
>  } NvmeAerResult;
>  
> +typedef struct QEMU_PACKED NvmeZonedResult {
> +uint64_t slba;
> +} NvmeZonedResult;
> +
>  typedef struct QEMU_PACKED NvmeCqe {
>  uint32_tresult;
> -uint32_trsvd;
> +uint32_tdw1;
>  uint16_tsq_head;
>  uint16_tsq_id;
>  uint16_tcid;
> @@ -679,6 +686,7 @@ enum NvmeStatusCodes {
>  NVME_INVALID_USE_OF_CMB = 0x0012,
>  NVME_INVALID_PRP_OFFSET = 0x0013,
>  NVME_CMD_SET_CMB_REJECTED   = 0x002b,
> +NVME_INVALID_CMD_SET= 0x002c,
>  NVME_LBA_RANGE  = 0x0080,
>  NVME_CAP_EXCEEDED   = 0x0081,
>  NVME_NS_NOT_READY   = 0x0082,
> @@ -703,6 +711,14 @@ enum NvmeStatusCodes {
>  NVME_CONFLICTING_ATTRS  = 0x0180,
>  NVME_INVALID_PROT_INFO  = 0x0181,
>  NVME_WRITE_TO_RO= 0x0182,
> +NVME_ZONE_BOUNDARY_ERROR= 0x01b8,
> +NVME_ZONE_FULL  = 0x01b9,
> +NVME_ZONE_READ_ONLY = 0x01ba,
> +NVME_ZONE_OFFLINE   = 0x01bb,
> +NVME_ZONE_INVALID_WRITE = 0x01bc,
> +NVME_ZONE_TOO_MANY_ACTIVE   = 0x01bd,
> +NVME_ZONE_TOO_MANY_OPEN = 0x01be,
> +NVME_ZONE_INVAL_TRANSITION  = 0x01bf,
>  NVME_WRITE_FAULT= 0x0280,
>  NVME_UNRECOVERED_READ   = 0x0281,
>  NVME_E2E_GUARD_ERROR= 0x0282,
> @@ -888,6 +904,11 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
>  uint8_t vs[1024];
>  } NvmeIdCtrl;
>  
> +typedef struct NvmeIdCtrlZoned {
> +uint8_t zasl;
> +uint8_t rsvd1[4095];
> +} NvmeIdCtrlZoned;
> +
>  enum NvmeIdCtrlOacs {
>  NVME_OACS_SECURITY  = 1 << 0,
>  NVME_OACS_FORMAT= 1 << 1,
> @@ -1016,6 +1037,12 @@ typedef struct QEMU_PACKED NvmeLBAF {
>  uint8_t rp;
>  } NvmeLBAF;
>  
> +typedef struct QEMU_PACKED NvmeLBAFE {
> +uint64_tzsze;
> +uint8_t zdes;
> +uint8_t rsvd9[7];
> +} NvmeLBAFE;
> +
>  #define NVME_NSID_BROADCAST 0xffffffff
>  
>  typedef struct QEMU_PACKED NvmeIdNs {
> @@ -1075,10 +1102,24 @@ enum NvmeNsIdentifierType {
>  
>  enum NvmeCsi {
>  NVME_CSI_NVM= 0x00,
> +NVME_CSI_ZONED  = 0x02,
>  };
>  
>  #define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
>  
> +typedef struct QEMU_PACKED NvmeIdNsZoned {
> +uint16_tzoc;
> +uint16_tozcs;
> +uint32_tmar;
> +uint32_tmor;
> +uint32_trrl;
> +uint32_tfrl;
> +uint8_t rsvd20[2796];
> +NvmeLBAFE   lbafe[16];
> +uint8_t rsvd3072[768];
> +uint8_t vs[256];
> +} NvmeIdNsZoned;
> +
>  /*Deallocate Logical Block Features*/
>  #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)   ((dlfeat) & 0x10)
>  #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat)((dlfeat) & 0x08)
> @@ -1111,10 +1152,76 @@ enum NvmeIdNsDps {
>  DPS_FIRST_EIGHT = 8,
>  };
>  
> +enum NvmeZoneAttr {
> +NVME_ZA_FINISHED_BY_CTLR = 1 << 0,
> +NVME_ZA_FINISH_RECOMMENDED   = 1 << 1,
> +NVME_ZA_RESET_RECOMMENDED= 1 << 2,
> +NVME_ZA_ZD_EXT_VALID = 1 << 7,
> +};
> +
> +typedef struct QEMU_PACKED NvmeZoneReportHeader {
> +uint64_tnr_zones;
> +uint8_t rsvd[56];
> +} NvmeZoneReportHeader;
> +
> +enum NvmeZoneReceiveAction {
> +NVME_ZONE_REPORT = 0,
> +NVME_ZONE_REPORT_EXTENDED= 1,
> +};
> +
> +enum NvmeZoneReportType {
> +NVME_ZONE_REPORT_ALL = 0,
> +NVME_ZONE_REPORT_EMPTY   = 1,
> +NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2,
> +NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3,
> +NVME_ZONE_REPORT_CLOSED  = 4,
> +NVME_ZONE_REPORT_FULL= 5,
> +NVME_ZONE_REPORT_READ_ONLY   = 6,
> +NVME_ZONE_REPORT_OFFLINE = 7,
> +};
> +
> +enum NvmeZoneType {
> +NVME_ZONE_TYPE_RESERVED  = 0x00,
> +NVME_ZONE_TYPE_SEQ_WRITE = 0x02,
> +};
> +
> +enum NvmeZoneSendAction {
> +NVME_ZONE_ACTION_RSD = 0x00,
> +NVME_ZONE_ACTION_CLOSE   = 0x01,
> +

[PATCH v2] file-posix: detect the lock using the real file

2020-12-08 Thread Li Feng

This patch addresses this issue:
When accessing a volume on an NFS filesystem without supporting the file lock,
tools, like qemu-img, will complain "Failed to lock byte 100".

In the original code, qemu_has_ofd_lock tests the lock on the
"/dev/null" pseudo-file. However, file.locking is a per-drive property,
which depends on the underlying filesystem.

In this patch, make the 'qemu_has_ofd_lock' with a filename be more
generic and reasonable.

Signed-off-by: Li Feng 
---
v2: remove the refactoring.
---
 block/file-posix.c | 32 ++--
 include/qemu/osdep.h   |  2 +-
 tests/test-image-locking.c |  2 +-
 util/osdep.c   | 19 ---
 4 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 806764f7e3..03be1b188c 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -595,7 +595,7 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
 switch (locking) {
 case ON_OFF_AUTO_ON:
 s->use_lock = true;
-if (!qemu_has_ofd_lock()) {
+if (!qemu_has_ofd_lock(filename)) {
 warn_report("File lock requested but OFD locking syscall is "
 "unavailable, falling back to POSIX file locks");
 error_printf("Due to the implementation, locks can be lost "
@@ -606,7 +606,7 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
 s->use_lock = false;
 break;
 case ON_OFF_AUTO_AUTO:
-s->use_lock = qemu_has_ofd_lock();
+s->use_lock = qemu_has_ofd_lock(filename);
 break;
 default:
 abort();
@@ -2388,6 +2388,7 @@ raw_co_create(BlockdevCreateOptions *options, Error 
**errp)
 int fd;
 uint64_t perm, shared;
 int result = 0;
+bool use_lock;
 
 /* Validate options and set default values */
 assert(options->driver == BLOCKDEV_DRIVER_FILE);
@@ -2428,19 +2429,22 @@ raw_co_create(BlockdevCreateOptions *options, Error 
**errp)
 perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
 shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
 
-/* Step one: Take locks */
-result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
-if (result < 0) {
-goto out_close;
-}
+use_lock = qemu_has_ofd_lock(file_opts->filename);
+if (use_lock) {
+/* Step one: Take locks */
+result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
+if (result < 0) {
+goto out_close;
+}
 
-/* Step two: Check that nobody else has taken conflicting locks */
-result = raw_check_lock_bytes(fd, perm, shared, errp);
-if (result < 0) {
-error_append_hint(errp,
-  "Is another process using the image [%s]?\n",
-  file_opts->filename);
-goto out_unlock;
+/* Step two: Check that nobody else has taken conflicting locks */
+result = raw_check_lock_bytes(fd, perm, shared, errp);
+if (result < 0) {
+error_append_hint(errp,
+  "Is another process using the image [%s]?\n",
+  file_opts->filename);
+goto out_unlock;
+}
 }
 
 /* Clear the file by truncating it to 0 */
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index f9ec8c84e9..349adad465 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -512,7 +512,7 @@ int qemu_dup(int fd);
 int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive);
 int qemu_unlock_fd(int fd, int64_t start, int64_t len);
 int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive);
-bool qemu_has_ofd_lock(void);
+bool qemu_has_ofd_lock(const char *filename);
 #endif
 
 #if defined(__HAIKU__) && defined(__i386__)
diff --git a/tests/test-image-locking.c b/tests/test-image-locking.c
index ba057bd66c..3e80246081 100644
--- a/tests/test-image-locking.c
+++ b/tests/test-image-locking.c
@@ -149,7 +149,7 @@ int main(int argc, char **argv)
 
 g_test_init(&argc, &argv, NULL);
 
-if (qemu_has_ofd_lock()) {
+if (qemu_has_ofd_lock(NULL)) {
 g_test_add_func("/image-locking/basic", test_image_locking_basic);
 g_test_add_func("/image-locking/set-perm-abort", test_set_perm_abort);
 }
diff --git a/util/osdep.c b/util/osdep.c
index 66d01b9160..20119aa9ae 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -187,7 +187,7 @@ static int qemu_parse_fdset(const char *param)
 return qemu_parse_fd(param);
 }
 
-static void qemu_probe_lock_ops(void)
+static void qemu_probe_lock_ops(const char *filename)
 {
 if (fcntl_op_setlk == -1) {
 #ifdef F_OFD_SETLK
@@ -200,10 +200,15 @@ static void qemu_probe_lock_ops(void)
 .l_type   = F_WRLCK,
 };
 
-fd = open("/dev/null", O_RDWR);
+if (filename) {
+fd = open(filename, O_RDWR);
+} else {
+fd = open("/dev/null", O_RDONLY);
+}

Re: [PATCH] file-posix: detect the lock using the real file

2020-12-08 Thread Li Feng

Kevin Wolf  于2020年12月8日周二 下午10:38写道：
>
> Am 08.12.2020 um 13:59 hat Li Feng geschrieben:
> > This patch addresses this issue:
> > When accessing a volume on an NFS filesystem without supporting the file 
> > lock,
> > tools, like qemu-img, will complain "Failed to lock byte 100".
> >
> > In the original code, the qemu_has_ofd_lock will test the lock on the
> > "/dev/null" pseudo-file. Actually, the file.locking is per-drive property,
> > which depends on the underlay filesystem.
> >
> > In this patch, make the 'qemu_has_ofd_lock' with a filename be more generic
> > and reasonable.
> >
> > Signed-off-by: Li Feng 
>
> Do you know any way how I could configure either the NFS server or the
> NFS client such that locking would fail? For any patch related to this,
> it would be good if I could even test the scenario.
>
Hi Kevin, currently our SmartX ZBS storage NFS server doesn't support
the file lock and the lock operation will return failure.
I have tried the kernel NFS server, and it works well. I don't have more kinds
of NFS servers.

> For this specific patch, I think Daniel has already provided a good
> explanation of the fundamental problems it has.
>
> Kevin
>

Re: [PATCH] file-posix: detect the lock using the real file

2020-12-08 Thread Li Feng

Daniel P. Berrangé  于2020年12月8日周二 下午9:45写道：
>
> On Tue, Dec 08, 2020 at 08:59:37PM +0800, Li Feng wrote:
> > This patch addresses this issue:
> > When accessing a volume on an NFS filesystem without supporting the file 
> > lock,
> > tools, like qemu-img, will complain "Failed to lock byte 100".
> >
> > In the original code, the qemu_has_ofd_lock will test the lock on the
> > "/dev/null" pseudo-file. Actually, the file.locking is per-drive property,
> > which depends on the underlay filesystem.
>
> IIUC, the problem you're describing is one of whether the filesystem
> supports fcntl locking at all, which is indeed a per-FS check.
>
> The QEMU code being changed though is just about detecting whether
> the host OS supports OFD or not, which is supposed to be a kernel
> level feature applied universally to all FS types.
>
> >
> > In this patch, make the 'qemu_has_ofd_lock' with a filename be more generic
> > and reasonable.
> >
> > Signed-off-by: Li Feng 
> > ---
> >  block/file-posix.c | 32 +++-
> >  include/qemu/osdep.h   |  2 +-
> >  tests/test-image-locking.c |  2 +-
> >  util/osdep.c   | 43 --
> >  4 files changed, 47 insertions(+), 32 deletions(-)
> >
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 806764f7e3..03be1b188c 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -595,7 +595,7 @@ static int raw_open_common(BlockDriverState *bs, QDict 
> > *options,
> >  switch (locking) {
> >  case ON_OFF_AUTO_ON:
> >  s->use_lock = true;
> > -if (!qemu_has_ofd_lock()) {
> > +if (!qemu_has_ofd_lock(filename)) {
> >  warn_report("File lock requested but OFD locking syscall is "
> >  "unavailable, falling back to POSIX file locks");
> >  error_printf("Due to the implementation, locks can be lost "
> > @@ -606,7 +606,7 @@ static int raw_open_common(BlockDriverState *bs, QDict 
> > *options,
> >  s->use_lock = false;
> >  break;
> >  case ON_OFF_AUTO_AUTO:
> > -s->use_lock = qemu_has_ofd_lock();
> > +s->use_lock = qemu_has_ofd_lock(filename);
> >  break;
> >  default:
> >  abort();
> > @@ -2388,6 +2388,7 @@ raw_co_create(BlockdevCreateOptions *options, Error 
> > **errp)
> >  int fd;
> >  uint64_t perm, shared;
> >  int result = 0;
> > +bool use_lock;
> >
> >  /* Validate options and set default values */
> >  assert(options->driver == BLOCKDEV_DRIVER_FILE);
> > @@ -2428,19 +2429,22 @@ raw_co_create(BlockdevCreateOptions *options, Error 
> > **errp)
> >  perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
> >  shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
> >
> > -/* Step one: Take locks */
> > -result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
> > -if (result < 0) {
> > -goto out_close;
> > -}
> > +use_lock = qemu_has_ofd_lock(file_opts->filename);
> > +if (use_lock) {
> > +/* Step one: Take locks */
> > +result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, 
> > errp);
> > +if (result < 0) {
> > +goto out_close;
> > +}
> >
> > -/* Step two: Check that nobody else has taken conflicting locks */
> > -result = raw_check_lock_bytes(fd, perm, shared, errp);
> > -if (result < 0) {
> > -error_append_hint(errp,
> > -  "Is another process using the image [%s]?\n",
> > -  file_opts->filename);
> > -goto out_unlock;
> > +/* Step two: Check that nobody else has taken conflicting locks */
> > +result = raw_check_lock_bytes(fd, perm, shared, errp);
> > +if (result < 0) {
> > +error_append_hint(errp,
> > +  "Is another process using the image [%s]?\n",
> > +  file_opts->filename);
> > +goto out_unlock;
> > +}
> >  }
> >
> >  /* Clear the file by truncating it to 0 */
> > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> > index f9ec8c84e9..349adad465 100644
> > --- a/include/qemu/osdep.h
> > +++ b/include/qemu/osdep.h
> > @@ -512,7 +512,7 @@ int qemu_dup(int fd);
> >  int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive);
> >  int qemu_unlock_fd(int fd, int64_t start, int64_t len);
> >  int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive);
> > -bool qemu_has_ofd_lock(void);
> > +bool qemu_has_ofd_lock(const char *filename);
> >  #endif
> >
> >  #if defined(__HAIKU__) && defined(__i386__)
> > diff --git a/tests/test-image-locking.c b/tests/test-image-locking.c
> > index ba057bd66c..3e80246081 100644
> > --- a/tests/test-image-locking.c
> > +++ b/tests/test-image-locking.c
> > @@ -149,7 +149,7 @@ int main(int argc, char **argv)
> >
> >  g_test_init(&argc, &argv, NULL);
> >
> > -if (qemu_has_ofd_lock()) {
> > +if

Re: [PATCH] hw/block/nvme: fix bad clearing of CAP

2020-12-08 Thread Keith Busch

On Tue, Dec 08, 2020 at 10:16:58AM +0100, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Commit 37712e00b1f0 ("hw/block/nvme: factor out pmr setup") changed the
> control flow such that the CAP register is erroneously cleared after
> nvme_init_pmr() has configured it. Since the entire NvmeCtrl structure
> is zero-filled initially, there is no need for the explicit clearing, so
> just remove it.
> 
> Fixes: 37712e00b1f0 ("hw/block/nvme: factor out pmr setup")
> Signed-off-by: Klaus Jensen 

Oops, nice catch.

Reviewed-by: Keith Busch 

> ---
>  hw/block/nvme.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 8814201364c1..28416b18a5c0 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -3040,7 +3040,6 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
> *pci_dev)
>  id->psd[0].enlat = cpu_to_le32(0x10);
>  id->psd[0].exlat = cpu_to_le32(0x4);
>  
> -n->bar.cap = 0;
>  NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
>  NVME_CAP_SET_CQR(n->bar.cap, 1);
>  NVME_CAP_SET_TO(n->bar.cap, 0xf);
> -- 
> 2.29.2
>

Re: [PATCH v2] hw/block/nvme: add compare command

2020-12-08 Thread Keith Busch

On Thu, Nov 26, 2020 at 07:56:05PM +0100, Klaus Jensen wrote:
> From: Gollu Appalanaidu 
> 
> Add the Compare command.
> 
> This implementation uses a bounce buffer to read in the data from
> storage and then compare with the host supplied buffer.
> 
> Signed-off-by: Gollu Appalanaidu 
> [k.jensen: rebased]
> Signed-off-by: Klaus Jensen 

Looks good.

Reviewed-by: Keith Busch

Re: [PATCH v3 2/2] hw/block/nvme: add simple copy command

2020-12-08 Thread Keith Busch

On Tue, Dec 08, 2020 at 09:33:39AM +0100, Klaus Jensen wrote:
> +static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
> +{



> +for (i = 0; i < nr; i++) {
> +uint32_t _nlb = le16_to_cpu(range[i].nlb) + 1;
> +if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) {
> +return NVME_CMD_SIZE_LIMIT | NVME_DNR;
> +}
> +
> +nlb += _nlb;
> +}
> +
> +if (nlb > le32_to_cpu(ns->id_ns.mcl)) {
> +return NVME_CMD_SIZE_LIMIT | NVME_DNR;
> +}
> +
> +bounce = bouncep = g_malloc(nvme_l2b(ns, nlb));
> +
> +for (i = 0; i < nr; i++) {
> +uint64_t slba = le64_to_cpu(range[i].slba);
> +uint32_t nlb = le16_to_cpu(range[i].nlb) + 1;
> +
> +status = nvme_check_bounds(ns, slba, nlb);
> +if (status) {
> +trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
> +goto free_bounce;
> +}
> +
> +if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
> +status = nvme_check_dulbe(ns, slba, nlb);
> +if (status) {
> +goto free_bounce;
> +}
> +}
> +}

Only comment I have is that these two for-loops look like they can be
collapsed into one, which also simplifies how you account for the bounce
buffer when error'ing out.

Re: [PATCH v9 08/12] hw/block/nvme: Support Zoned Namespace Command Set

2020-12-08 Thread Klaus Jensen

On Dec  8 20:02, Dmitry Fomichev wrote:
> Hi Klaus,
> 
> Thank you for your review! Please see replies below...
> 
> 
> On Thu, 2020-11-12 at 20:36 +0100, Klaus Jensen wrote:
> > Hi Dmitry,
> > 
> > I know you posted v10, but my comments should be relevant to that as
> > well.
> > 
> > On Nov  5 11:53, Dmitry Fomichev wrote:
> > > @@ -133,6 +300,12 @@ static Property nvme_ns_props[] = {
> > >  DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
> > >  DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
> > >  DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
> > > +DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false),
> > 
> > I disagree on this. Using the "magic" value ensures that only one
> > command set can be selected. We can do a custom property so we can set
> > `iocs=zoned` as well as `iocs=0x2` if that makes it more user friendly?
> 
> I doubt that an average admin will even know what "iocs" really means, let
> alone know any magic values. On the other hand, it would be trivial
> to add a check to prevent users from doing zoned=true kv=true, etc. I don't
> see that as a big problem.
> 

OK, I'm fine with this.

> > 
> > > +DEFINE_PROP_SIZE("zoned.zsze", NvmeNamespace, params.zone_size_bs,
> > > + NVME_DEFAULT_ZONE_SIZE),
> > > +DEFINE_PROP_SIZE("zoned.zcap", NvmeNamespace, params.zone_cap_bs, 0),
> > > +DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace,
> > > + params.cross_zone_read, false),
> > 
> > Same reason why I think we should just expose ozcs directly instead of
> > adding more parameters.
> > 
> > We are already adding a bunch of new parameters - might as well keep the
> > number as low as possible.
> 
> There is only RAZB that is defined in OZCS as of now and you will not be
> able to reduce the number of module parameters by exposing OZCS instead of
> RAZB. But telling the user what RAZB really means in the parameter name is,
> IMO, a better choice.
> 

The TP that shall not be named puts stuff in there but I'm OK with the
zoned.cross_read parameter.

> > > +static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
> > > +{
> > > +NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
> > > +NvmeNamespace *ns = req->ns;
> > > +/* cdw12 is zero-based number of dwords to return. Convert to bytes 
> > > */
> > > +uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
> > > +uint32_t dw13 = le32_to_cpu(cmd->cdw13);
> > > +uint32_t zone_idx, zra, zrasf, partial;
> > > +uint64_t max_zones, nr_zones = 0;
> > > +uint16_t ret;
> > > +uint64_t slba;
> > > +NvmeZoneDescr *z;
> > > +NvmeZone *zs;
> > > +NvmeZoneReportHeader *header;
> > > +void *buf, *buf_p;
> > > +size_t zone_entry_sz;
> > > +
> > > +req->status = NVME_SUCCESS;
> > > +
> > > +ret = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
> > > +if (ret) {
> > > +return ret;
> > > +}
> > 
> > Zone Management Receive does not specify anything for the given SLBA.
> > Out-of-bounds is acceptable, just results in no descriptors being
> > returned.
> 
> SLBA is an LBA in the lowest zone that is considered for reporting.
> The text in 4.4.1.1 a) says "report only zone descriptors for which
> the ZSLBA value is greater or equal to the ZSLBA value of the zone
> specified by the SLBA value in the command". The last part of this
> paragraph basically says that SLBA has to point to a zone, hence
> the error if it doesn't.
> 

Hmm. I tend to disagree since nowhere does the spec define that an error
should be returned if the given ZSLBA does not resolve to a valid zone.

> > > +
> > > +zone_idx++;
> > > +}
> > > +
> > > +if (!partial) {
> > > +for (; zone_idx < ns->num_zones; zone_idx++) {
> > > +zs = &ns->zone_array[zone_idx];
> > > +if (nvme_zone_matches_filter(zrasf, zs)) {
> > > +nr_zones++;
> > > +}
> > > +}
> > > +}
> > 
> > I did something like this as well (only counting matching zones from
> > given SLBA), but when looking at the spec now, it seems that this is a
> > remnant from an older version of the spec? Please correct me if wrong.
> > 
> > On the Partial Report bit, the ratified specification just says that "If
> > this bit is cleared to '0', then the value in the Number of Zones field
> > indicates the number of zone descriptors that match the criteria in the
> > Zone Receive Action Specific field.".
> > 
> > So, I think if !partial, the Number of Zones field should not consider
> > the SLBA and just count from 0.
> 
> If Partial is 0, then the header contains the number of descriptors that
> can be potentially reported from SLBA until the end of LBA range if the
> buffer would be unlimited. If the Partial bit is 1, the same count is
> additionally limited by the number of descriptors that can fit to the
> provided buffer. Perhaps ZNS spec is not quite clear about this, but this
> is the way all

[PATCH v11 13/13] hw/block/nvme: Document zoned parameters in usage text

2020-12-08 Thread Dmitry Fomichev

Added brief descriptions of the new device properties that are
now available to users to configure features of Zoned Namespace
Command Set in the emulator.

This patch is for documentation only, no functionality change.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme.c | 47 ++-
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c2336bfd67..fbb69c82c6 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -9,7 +9,7 @@
  */
 
 /**
- * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
+ * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
  *
  *  https://nvmexpress.org/developers/nvme-specification/
  */
@@ -22,8 +22,9 @@
  *  [pmrdev=,] \
  *  max_ioqpairs=, \
  *  aerl=, aer_max_queued=, \
- *  mdts=
- *  -device nvme-ns,drive=,bus=bus_name,nsid=
+ *  mdts=,zoned.append_size_limit= \
+ *  -device nvme-ns,drive=,bus=,nsid=,\
+ *  zoned=
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
@@ -41,14 +42,50 @@
  * ~~
  * - `aerl`
  *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
- *   of concurrently outstanding Asynchronous Event Request commands suppoert
+ *   of concurrently outstanding Asynchronous Event Request commands support
  *   by the controller. This is a 0's based value.
  *
  * - `aer_max_queued`
  *   This is the maximum number of events that the device will enqueue for
- *   completion when there are no oustanding AERs. When the maximum number of
+ *   completion when there are no outstanding AERs. When the maximum number of
  *   enqueued events are reached, subsequent events will be dropped.
  *
+ * - `zoned.append_size_limit`
+ *   The maximum I/O size in bytes that is allowed in Zone Append command.
+ *   The default is 128KiB. Since internally this value is maintained as
+ *   ZASL = log2(<maximum append size> / <page size>), some values assigned
+ *   to this property may be rounded down and result in a lower maximum ZA
+ *   data size being in effect. By setting this property to 0, users can make
+ *   ZASL to be equal to MDTS. This property only affects zoned namespaces.
+ *
+ * Setting `zoned` to true selects Zoned Command Set at the namespace.
+ * In this case, the following namespace properties are available to configure
+ * zoned operation:
+ * zoned.zsze=
+ * The number may be followed by K, M, G as in kilo-, mega- or giga-.
+ *
+ * zoned.zcap=
+ * The value 0 (default) forces zone capacity to be the same as zone
+ * size. The value of this property may not exceed zone size.
+ *
+ * zoned.descr_ext_size=
+ * This value needs to be specified in 64B units. If it is zero,
+ * namespace(s) will not support zone descriptor extensions.
+ *
+ * zoned.max_active=
+ * The default value means there is no limit to the number of
+ * concurrently active zones.
+ *
+ * zoned.max_open=
+ * The default value means there is no limit to the number of
+ * concurrently open zones.
+ *
+ * zoned.offline_zones=
+ *
+ * zoned.rdonly_zones=
+ *
+ * zoned.cross_zone_read=
+ * Setting this property to true enables Read Across Zone Boundaries.
  */
 
 #include "qemu/osdep.h"
-- 
2.28.0

[PATCH v11 12/13] hw/block/nvme: Add injection of Offline/Read-Only zones

2020-12-08 Thread Dmitry Fomichev

ZNS specification defines two zone conditions for the zones that no
longer can function properly, possibly because of flash wear or other
internal fault. It is useful to be able to "inject" a small number of
such zones for testing purposes.

This commit defines two optional device properties, "offline_zones"
and "rdonly_zones". Users can assign non-zero values to these variables
to specify the number of zones to be initialized as Offline or
Read-Only. The actual number of injected zones may be smaller than the
requested amount - Read-Only and Offline counts are expected to be much
smaller than the total number of zones on a drive.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h |  2 ++
 hw/block/nvme-ns.c | 53 ++
 2 files changed, 55 insertions(+)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index f8f3c28c36..1196865b7a 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -36,6 +36,8 @@ typedef struct NvmeNamespaceParams {
 uint32_t max_active_zones;
 uint32_t max_open_zones;
 uint32_t zd_extension_size;
+uint32_t nr_offline_zones;
+uint32_t nr_rdonly_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index c5a7bafcf7..0a8b741bc9 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -21,6 +21,7 @@
 #include "sysemu/sysemu.h"
 #include "sysemu/block-backend.h"
 #include "qapi/error.h"
+#include "crypto/random.h"
 
 #include "hw/qdev-properties.h"
 #include "hw/qdev-core.h"
@@ -163,6 +164,21 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace 
*ns, Error **errp)
 }
 }
 
+if (ns->params.max_open_zones < ns->num_zones) {
+if (ns->params.nr_offline_zones >
+ns->num_zones - ns->params.max_open_zones) {
+error_setg(errp, "offline_zones value %u is too large",
+ns->params.nr_offline_zones);
+return -1;
+}
+if (ns->params.nr_rdonly_zones + ns->params.nr_offline_zones >
+ns->num_zones - ns->params.max_open_zones) {
+error_setg(errp, "rdonly_zones value %u is too large",
+ns->params.nr_rdonly_zones);
+return -1;
+}
+}
+
 return 0;
 }
 
@@ -171,7 +187,9 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
 uint64_t start = 0, zone_size = ns->zone_size;
 uint64_t capacity = ns->num_zones * zone_size;
 NvmeZone *zone;
+uint32_t rnd;
 int i;
+uint16_t zs;
 
 ns->zone_array = g_new0(NvmeZone, ns->num_zones);
 if (ns->params.zd_extension_size) {
@@ -203,6 +221,37 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
 if (is_power_of_2(ns->zone_size)) {
 ns->zone_size_log2 = 63 - clz64(ns->zone_size);
 }
+
+/* If required, make some zones Offline or Read Only */
+
+for (i = 0; i < ns->params.nr_offline_zones; i++) {
+do {
+qcrypto_random_bytes(&rnd, sizeof(rnd), NULL);
+rnd %= ns->num_zones;
+} while (rnd < ns->params.max_open_zones);
+zone = &ns->zone_array[rnd];
+zs = nvme_get_zone_state(zone);
+if (zs != NVME_ZONE_STATE_OFFLINE) {
+nvme_set_zone_state(zone, NVME_ZONE_STATE_OFFLINE);
+} else {
+i--;
+}
+}
+
+for (i = 0; i < ns->params.nr_rdonly_zones; i++) {
+do {
+qcrypto_random_bytes(&rnd, sizeof(rnd), NULL);
+rnd %= ns->num_zones;
+} while (rnd < ns->params.max_open_zones);
+zone = &ns->zone_array[rnd];
+zs = nvme_get_zone_state(zone);
+if (zs != NVME_ZONE_STATE_OFFLINE &&
+zs != NVME_ZONE_STATE_READ_ONLY) {
+nvme_set_zone_state(zone, NVME_ZONE_STATE_READ_ONLY);
+} else {
+i--;
+}
+}
 }
 
 static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index)
@@ -368,6 +417,10 @@ static Property nvme_ns_props[] = {
params.max_open_zones, 0),
 DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
params.zd_extension_size, 0),
+DEFINE_PROP_UINT32("zoned.offline_zones", NvmeNamespace,
+   params.nr_offline_zones, 0),
+DEFINE_PROP_UINT32("zoned.rdonly_zones", NvmeNamespace,
+   params.nr_rdonly_zones, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
2.28.0

[PATCH v11 10/13] hw/block/nvme: Introduce max active and open zone limits

2020-12-08 Thread Dmitry Fomichev

Add two module properties, "zoned.max_active" and "zoned.max_open"
to control the maximum number of zones that can be active or open.
Once these variables are set to non-default values, these limits are
checked during I/O and Too Many Active or Too Many Open command status
is returned if they are exceeded.

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h| 41 +++
 hw/block/nvme-ns.c| 31 ++-
 hw/block/nvme.c   | 92 +++
 hw/block/trace-events |  2 +
 4 files changed, 164 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 388381dda0..7e1fd26909 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -33,6 +33,8 @@ typedef struct NvmeNamespaceParams {
 bool cross_zone_read;
 uint64_t zone_size_bs;
 uint64_t zone_cap_bs;
+uint32_t max_active_zones;
+uint32_t max_open_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -54,6 +56,8 @@ typedef struct NvmeNamespace {
 uint64_tzone_size;
 uint64_tzone_capacity;
 uint32_tzone_size_log2;
+int32_t nr_open_zones;
+int32_t nr_active_zones;
 
 NvmeNamespaceParams params;
 
@@ -125,6 +129,43 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline void nvme_aor_inc_open(NvmeNamespace *ns)
+{
+assert(ns->nr_open_zones >= 0);
+if (ns->params.max_open_zones) {
+ns->nr_open_zones++;
+assert(ns->nr_open_zones <= ns->params.max_open_zones);
+}
+}
+
+static inline void nvme_aor_dec_open(NvmeNamespace *ns)
+{
+if (ns->params.max_open_zones) {
+assert(ns->nr_open_zones > 0);
+ns->nr_open_zones--;
+}
+assert(ns->nr_open_zones >= 0);
+}
+
+static inline void nvme_aor_inc_active(NvmeNamespace *ns)
+{
+assert(ns->nr_active_zones >= 0);
+if (ns->params.max_active_zones) {
+ns->nr_active_zones++;
+assert(ns->nr_active_zones <= ns->params.max_active_zones);
+}
+}
+
+static inline void nvme_aor_dec_active(NvmeNamespace *ns)
+{
+if (ns->params.max_active_zones) {
+assert(ns->nr_active_zones > 0);
+ns->nr_active_zones--;
+assert(ns->nr_active_zones >= ns->nr_open_zones);
+}
+assert(ns->nr_active_zones >= 0);
+}
+
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 1df45bbe35..aaef69fb47 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -136,6 +136,21 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace 
*ns, Error **errp)
 ns->zone_size = zone_size / lbasz;
 ns->zone_capacity = zone_cap / lbasz;
 ns->num_zones = ns->size / lbasz / ns->zone_size;
+
+/* Do a few more sanity checks of ZNS properties */
+if (ns->params.max_open_zones > ns->num_zones) {
+error_setg(errp,
+   "max_open_zones value %u exceeds the number of zones %u",
+   ns->params.max_open_zones, ns->num_zones);
+return -1;
+}
+if (ns->params.max_active_zones > ns->num_zones) {
+error_setg(errp,
+   "max_active_zones value %u exceeds the number of zones %u",
+   ns->params.max_active_zones, ns->num_zones);
+return -1;
+}
+
 return 0;
 }
 
@@ -183,8 +198,8 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index)
 id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
 
 /* MAR/MOR are zeroes-based, 0xffffffff means no limit */
-id_ns_z->mar = 0x;
-id_ns_z->mor = 0x;
+id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
+id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
 id_ns_z->zoc = 0;
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
@@ -210,6 +225,7 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone 
*zone)
 trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
 }
+nvme_aor_inc_active(ns);
 QTAILQ_INSERT_HEAD(>closed_zones, zone, entry);
 } else {
 trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
@@ -226,16 +242,23 @@ static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
 
 QTAILQ_FOREACH_SAFE(zone, >closed_zones, entry, next) {
 QTAILQ_REMOVE(>closed_zones, zone, entry);
+nvme_aor_dec_active(ns);
 nvme_clear_zone(ns, zone);
 }
 QTAILQ_FOREACH_SAFE(zone, >imp_open_zones, entry, next) {
 QTAILQ_REMOVE(>imp_open_zones, zone, entry);
+nvme_aor_dec_open(ns);
+nvme_aor_dec_active(ns);
 nvme_clear_zone(ns, zone);
 }
 QTAILQ_FOREACH_SAFE(zone, >exp_open_zones, entry, next) {

[PATCH v11 11/13] hw/block/nvme: Support Zone Descriptor Extensions

2020-12-08 Thread Dmitry Fomichev

Zone Descriptor Extension is a label that can be assigned to a zone.
It can be set to an Empty zone and it stays assigned until the zone
is reset.

This commit adds a new optional module property,
"zoned.descr_ext_size". Its value must be a multiple of 64 bytes.
If this value is non-zero, it becomes possible to assign extensions
of that size to any Empty zones. The default value for this property
is 0, therefore setting extensions is disabled by default.

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Klaus Jensen 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h|  8 +++
 hw/block/nvme-ns.c| 25 ++--
 hw/block/nvme.c   | 53 +--
 hw/block/trace-events |  2 ++
 4 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 7e1fd26909..f8f3c28c36 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -35,6 +35,7 @@ typedef struct NvmeNamespaceParams {
 uint64_t zone_cap_bs;
 uint32_t max_active_zones;
 uint32_t max_open_zones;
+uint32_t zd_extension_size;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -56,6 +57,7 @@ typedef struct NvmeNamespace {
 uint64_tzone_size;
 uint64_tzone_capacity;
 uint32_tzone_size_log2;
+uint8_t *zd_extensions;
 int32_t nr_open_zones;
 int32_t nr_active_zones;
 
@@ -129,6 +131,12 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
+ uint32_t zone_idx)
+{
+return >zd_extensions[zone_idx * ns->params.zd_extension_size];
+}
+
 static inline void nvme_aor_inc_open(NvmeNamespace *ns)
 {
 assert(ns->nr_open_zones >= 0);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index aaef69fb47..c5a7bafcf7 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -151,6 +151,18 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace 
*ns, Error **errp)
 return -1;
 }
 
+if (ns->params.zd_extension_size) {
+if (ns->params.zd_extension_size & 0x3f) {
+error_setg(errp,
+"zone descriptor extension size must be a multiple of 64B");
+return -1;
+}
+if ((ns->params.zd_extension_size >> 6) > 0xff) {
+error_setg(errp, "zone descriptor extension size is too large");
+return -1;
+}
+}
+
 return 0;
 }
 
@@ -162,6 +174,10 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
 int i;
 
 ns->zone_array = g_new0(NvmeZone, ns->num_zones);
+if (ns->params.zd_extension_size) {
+ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
+  ns->num_zones);
+}
 
 QTAILQ_INIT(>exp_open_zones);
 QTAILQ_INIT(>imp_open_zones);
@@ -204,7 +220,8 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index)
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
 id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
-id_ns_z->lbafe[lba_index].zdes = 0;
+id_ns_z->lbafe[lba_index].zdes =
+ns->params.zd_extension_size >> 6; /* Units of 64B */
 
 ns->csi = NVME_CSI_ZONED;
 ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);
@@ -220,7 +237,8 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone 
*zone)
 
 zone->w_ptr = zone->d.wp;
 state = nvme_get_zone_state(zone);
-if (zone->d.wp != zone->d.zslba) {
+if (zone->d.wp != zone->d.zslba ||
+(zone->d.za & NVME_ZA_ZD_EXT_VALID)) {
 if (state != NVME_ZONE_STATE_CLOSED) {
 trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
@@ -316,6 +334,7 @@ void nvme_ns_cleanup(NvmeNamespace *ns)
 if (ns->params.zoned) {
 g_free(ns->id_ns_zoned);
 g_free(ns->zone_array);
+g_free(ns->zd_extensions);
 }
 }
 
@@ -347,6 +366,8 @@ static Property nvme_ns_props[] = {
params.max_active_zones, 0),
 DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
params.max_open_zones, 0),
+DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
+   params.zd_extension_size, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 8b97b713a3..c2336bfd67 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1715,6 +1715,25 @@ static uint16_t nvme_offline_zone(NvmeNamespace *ns, 
NvmeZone *zone,
 }
 }
 
+static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
+{
+uint16_t status;
+uint8_t state = nvme_get_zone_state(zone);
+
+if (state == NVME_ZONE_STATE_EMPTY) {
+status = nvme_aor_check(ns, 1, 0);
+if (status != NVME_SUCCESS) {
+

[PATCH v11 00/13] hw/block/nvme: Support Namespace Types and Zoned Namespace Command Set

2020-12-08 Thread Dmitry Fomichev

v10 -> v11:

 - Address review comments by Klaus.

 - Add a patch to separate the handling of controller reset
   and subsystem shutdown. Place the patch at the beginning
   of the series so it can be picked up separately.

 - Rebase on the current nvme-next branch.

v9 -> v10:

 - Correctly check for MDTS in Zone Management Receive handler.

 - Change Klaus' "Reviewed-by" email in UUID patch.

v8 -> v9:

 - Move the modifications to "include/block/nvme.h" made to
   introduce ZNS-related definitions to a separate patch.

 - Add a new struct, NvmeZonedResult, along the same lines as the
   existing NvmeAerResult, to carry Zone Append LBA returned to
   the host. Now, there is no need to modify NvmeCqe struct except
   renaming DW1 field from "rsvd" to "dw1".

 - Add check for MDTS in Zone Management Receive handler.

 - Remove checks for ns->attached since the value of this flag
   is always true for now.

 - Rebase to the current qemu-nvme/nvme-next branch.

v7 -> v8:

 - Move refactoring commits to the front of the series.

 - Remove "attached" and "fill_pattern" device properties.

 - Only close open zones upon subsystem shutdown, not when CC.EN flag
   is set to 0. Avoid looping through all zones by iterating through
   lists of open and closed zones.

 - Improve bulk processing of zones aka zoned operations with "all"
   flag set. Avoid looping through the entire zone array for all zone
   operations except Offline Zone.

 - Prefix ZNS-related property names with "zoned.". The "zoned" Boolean
   property is retained to turn on zoned command set as it is much more
   intuitive and user-friendly compared to setting a magic number value
   to csi property.

 - Address review comments.

 - Remove unused trace events.

v6 -> v7:

 - Introduce ns->iocs initialization function earlier in the series,
   in CSE Log patch.

 - Set NVM iocs for zoned namespaces when CC.CSS is set to
   NVME_CC_CSS_NVM.

 - Clean up code in CSE log handler.
 
v5 -> v6:

 - Remove zoned state persistence code. Replace position-independent
   zone lists with QTAILQs.

 - Close all open zones upon clearing of the controller. This is
   a similar procedure to the one previously performed upon powering
   up with zone persistence. 

 - Squash NS Types and ZNS triplets of commits to keep definitions
   and trace event definitions together with the implementation code.

 - Move namespace UUID generation to a separate patch. Add the new
   "uuid" property as suggested by Klaus.

 - Rework Commands and Effects patch to make sure that the log is
   always in sync with the actual set of commands supported.

 - Add two refactoring commits at the end of the series to
   optimize read and write i/o path.

- Incorporate feedback from Keith, Klaus and Niklas:

  * fix rebase errors in nvme_identify_ns_descr_list()
  * remove unnecessary code from nvme_write_bar()
  * move csi to NvmeNamespace and use it from the beginning in NSTypes
patch
  * change zone read processing to cover all corner cases with RAZB=1
  * sync w_ptr and d.wp in case of an i/o error at the preceding zone
  * reword the commit message in active/inactive patch with the new
text from Niklas
  * correct dlfeat reporting depending on the fill pattern set
  * add more checks for "attached" n/s parameter to prevent i/o and
get/set features on inactive namespaces
  * Use DEFINE_PROP_SIZE and DEFINE_PROP_SIZE32 for zone size/capacity
and ZASL respectively
  * Improve zone size and capacity validation
  * Correctly report NSZE

v4 -> v5:

 - Rebase to the current qemu-nvme.

 - Use HostMemoryBackendFile as the backing storage for persistent
   zone metadata.

 - Fix the issue with filling the valid data in the next zone if RAZB
   is enabled.

v3 -> v4:

 - Fix bugs introduced in v2/v3 for QD > 1 operation. Now, all writes
   to a zone happen at the new write pointer variable, zone->w_ptr,
   that is advanced right after submitting the backend i/o. The existing
   zone->d.wp variable is updated upon the successful write completion
   and it is used for zone reporting. Some code has been split from
   nvme_finalize_zoned_write() function to a new function,
   nvme_advance_zone_wp().

 - Make the code compile under mingw. Switch to using QEMU API for
   mmap/msync, i.e. memory_region...(). Since mmap is not available in
   mingw (even though there is mman-win32 library available on Github),
   conditional compilation is added around these calls to avoid
   undefined symbols under mingw. A better fix would be to add stub
   functions to softmmu/memory.c for the case when CONFIG_POSIX is not
   defined, but such change is beyond the scope of this patchset and it
   can be made in a separate patch.

 - Correct permission mask used to open zone metadata file.

 - Fold "Define 64 bit cqe.result" patch into ZNS commit.

 - Use clz64/clz32 instead of defining nvme_ilog2() function.

 - Simplify rpt_empty_id_struct() code, move nvme_fill_data() back
   to ZNS patch.

 - Fix a

[PATCH v11 06/13] hw/block/nvme: Add support for Namespace Types

2020-12-08 Thread Dmitry Fomichev

From: Niklas Cassel 

Define the structures and constants required to implement
Namespace Types support.

Namespace Types introduce a new command set, "I/O Command Sets",
that allows the host to retrieve the command sets associated with
a namespace. Introduce support for the command set and enable
detection for the NVM Command Set.

The new workflows for identify commands rely heavily on zero-filled
identify structs. E.g., certain CNS commands are defined to return
a zero-filled identify struct when an inactive namespace NSID
is supplied.

Add a helper function in order to avoid code duplication when
reporting zero-filled identify structures.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
---
 hw/block/nvme-ns.h|   1 +
 include/block/nvme.h  |  64 ++
 hw/block/nvme-ns.c|   2 +
 hw/block/nvme.c   | 188 +++---
 hw/block/trace-events |   6 ++
 5 files changed, 217 insertions(+), 44 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index bdeaf1c0de..bdbc98c2ec 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,6 +31,7 @@ typedef struct NvmeNamespace {
 int64_t  size;
 NvmeIdNs id_ns;
 const uint32_t *iocs;
+uint8_t  csi;
 
 NvmeNamespaceParams params;
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 422c98a297..890977db4b 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -84,6 +84,7 @@ enum NvmeCapMask {
 
 enum NvmeCapCss {
 NVME_CAP_CSS_NVM= 1 << 0,
+NVME_CAP_CSS_CSI_SUPP   = 1 << 6,
 NVME_CAP_CSS_ADMIN_ONLY = 1 << 7,
 };
 
@@ -117,9 +118,25 @@ enum NvmeCcMask {
 
 enum NvmeCcCss {
 NVME_CC_CSS_NVM= 0x0,
+NVME_CC_CSS_CSI= 0x6,
 NVME_CC_CSS_ADMIN_ONLY = 0x7,
 };
 
+#define NVME_SET_CC_EN(cc, val) \
+(cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT)
+#define NVME_SET_CC_CSS(cc, val)\
+(cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT)
+#define NVME_SET_CC_MPS(cc, val)\
+(cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT)
+#define NVME_SET_CC_AMS(cc, val)\
+(cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT)
+#define NVME_SET_CC_SHN(cc, val)\
+(cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT)
+#define NVME_SET_CC_IOSQES(cc, val) \
+(cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT)
+#define NVME_SET_CC_IOCQES(cc, val) \
+(cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT)
+
 enum NvmeCstsShift {
 CSTS_RDY_SHIFT  = 0,
 CSTS_CFS_SHIFT  = 1,
@@ -534,8 +551,13 @@ typedef struct QEMU_PACKED NvmeIdentify {
 uint64_trsvd2[2];
 uint64_tprp1;
 uint64_tprp2;
-uint32_tcns;
-uint32_trsvd11[5];
+uint8_t cns;
+uint8_t rsvd10;
+uint16_tctrlid;
+uint16_tnvmsetid;
+uint8_t rsvd11;
+uint8_t csi;
+uint32_trsvd12[4];
 } NvmeIdentify;
 
 typedef struct QEMU_PACKED NvmeRwCmd {
@@ -656,6 +678,7 @@ enum NvmeStatusCodes {
 NVME_SGL_DESCR_TYPE_INVALID = 0x0011,
 NVME_INVALID_USE_OF_CMB = 0x0012,
 NVME_INVALID_PRP_OFFSET = 0x0013,
+NVME_CMD_SET_CMB_REJECTED   = 0x002b,
 NVME_LBA_RANGE  = 0x0080,
 NVME_CAP_EXCEEDED   = 0x0081,
 NVME_NS_NOT_READY   = 0x0082,
@@ -783,11 +806,15 @@ typedef struct QEMU_PACKED NvmePSD {
 
 #define NVME_IDENTIFY_DATA_SIZE 4096
 
-enum {
-NVME_ID_CNS_NS = 0x0,
-NVME_ID_CNS_CTRL   = 0x1,
-NVME_ID_CNS_NS_ACTIVE_LIST = 0x2,
-NVME_ID_CNS_NS_DESCR_LIST  = 0x3,
+enum NvmeIdCns {
+NVME_ID_CNS_NS= 0x00,
+NVME_ID_CNS_CTRL  = 0x01,
+NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
+NVME_ID_CNS_NS_DESCR_LIST = 0x03,
+NVME_ID_CNS_CS_NS = 0x05,
+NVME_ID_CNS_CS_CTRL   = 0x06,
+NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
+NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
 };
 
 typedef struct QEMU_PACKED NvmeIdCtrl {
@@ -938,6 +965,7 @@ enum NvmeFeatureIds {
 NVME_WRITE_ATOMICITY= 0xa,
 NVME_ASYNCHRONOUS_EVENT_CONF= 0xb,
 NVME_TIMESTAMP  = 0xe,
+NVME_COMMAND_SET_PROFILE= 0x19,
 NVME_SOFTWARE_PROGRESS_MARKER   = 0x80,
 NVME_FID_MAX= 0x100,
 };
@@ -1027,18 +1055,26 @@ typedef struct QEMU_PACKED NvmeIdNsDescr {
 uint8_t rsvd2[2];
 } NvmeIdNsDescr;
 
-enum {
-NVME_NIDT_EUI64_LEN =  8,
-NVME_NIDT_NGUID_LEN = 16,
-NVME_NIDT_UUID_LEN  = 16,
+enum NvmeNsIdentifierLength {
+NVME_NIDL_EUI64 = 8,
+NVME_NIDL_NGUID = 16,
+NVME_NIDL_UUID  = 16,
+NVME_NIDL_CSI   = 1,
 };
 
 enum NvmeNsIdentifierType {
-NVME_NIDT_EUI64 = 0x1,
-NVME_NIDT_NGUID = 0x2,
-NVME_NIDT_UUID  = 0x3,
+NVME_NIDT_EUI64 = 0x01,
+NVME_NIDT_NGUID = 0x02,
+NVME_NIDT_UUID

[PATCH v11 08/13] block/nvme: Make ZNS-related definitions

2020-12-08 Thread Dmitry Fomichev

Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053).

Signed-off-by: Dmitry Fomichev 
---
 include/block/nvme.h | 114 ++-
 1 file changed, 113 insertions(+), 1 deletion(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 29d826ab19..a9165402d6 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -489,6 +489,9 @@ enum NvmeIoCommands {
 NVME_CMD_COMPARE= 0x05,
 NVME_CMD_WRITE_ZEROES   = 0x08,
 NVME_CMD_DSM= 0x09,
+NVME_CMD_ZONE_MGMT_SEND = 0x79,
+NVME_CMD_ZONE_MGMT_RECV = 0x7a,
+NVME_CMD_ZONE_APPEND= 0x7d,
 };
 
 typedef struct QEMU_PACKED NvmeDeleteQ {
@@ -648,9 +651,13 @@ typedef struct QEMU_PACKED NvmeAerResult {
 uint8_t resv;
 } NvmeAerResult;
 
+typedef struct QEMU_PACKED NvmeZonedResult {
+uint64_t slba;
+} NvmeZonedResult;
+
 typedef struct QEMU_PACKED NvmeCqe {
 uint32_tresult;
-uint32_trsvd;
+uint32_tdw1;
 uint16_tsq_head;
 uint16_tsq_id;
 uint16_tcid;
@@ -679,6 +686,7 @@ enum NvmeStatusCodes {
 NVME_INVALID_USE_OF_CMB = 0x0012,
 NVME_INVALID_PRP_OFFSET = 0x0013,
 NVME_CMD_SET_CMB_REJECTED   = 0x002b,
+NVME_INVALID_CMD_SET= 0x002c,
 NVME_LBA_RANGE  = 0x0080,
 NVME_CAP_EXCEEDED   = 0x0081,
 NVME_NS_NOT_READY   = 0x0082,
@@ -703,6 +711,14 @@ enum NvmeStatusCodes {
 NVME_CONFLICTING_ATTRS  = 0x0180,
 NVME_INVALID_PROT_INFO  = 0x0181,
 NVME_WRITE_TO_RO= 0x0182,
+NVME_ZONE_BOUNDARY_ERROR= 0x01b8,
+NVME_ZONE_FULL  = 0x01b9,
+NVME_ZONE_READ_ONLY = 0x01ba,
+NVME_ZONE_OFFLINE   = 0x01bb,
+NVME_ZONE_INVALID_WRITE = 0x01bc,
+NVME_ZONE_TOO_MANY_ACTIVE   = 0x01bd,
+NVME_ZONE_TOO_MANY_OPEN = 0x01be,
+NVME_ZONE_INVAL_TRANSITION  = 0x01bf,
 NVME_WRITE_FAULT= 0x0280,
 NVME_UNRECOVERED_READ   = 0x0281,
 NVME_E2E_GUARD_ERROR= 0x0282,
@@ -888,6 +904,11 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
 uint8_t vs[1024];
 } NvmeIdCtrl;
 
+typedef struct NvmeIdCtrlZoned {
+uint8_t zasl;
+uint8_t rsvd1[4095];
+} NvmeIdCtrlZoned;
+
 enum NvmeIdCtrlOacs {
 NVME_OACS_SECURITY  = 1 << 0,
 NVME_OACS_FORMAT= 1 << 1,
@@ -1016,6 +1037,12 @@ typedef struct QEMU_PACKED NvmeLBAF {
 uint8_t rp;
 } NvmeLBAF;
 
+typedef struct QEMU_PACKED NvmeLBAFE {
+uint64_tzsze;
+uint8_t zdes;
+uint8_t rsvd9[7];
+} NvmeLBAFE;
+
 #define NVME_NSID_BROADCAST 0x
 
 typedef struct QEMU_PACKED NvmeIdNs {
@@ -1075,10 +1102,24 @@ enum NvmeNsIdentifierType {
 
 enum NvmeCsi {
 NVME_CSI_NVM= 0x00,
+NVME_CSI_ZONED  = 0x02,
 };
 
 #define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
 
+typedef struct QEMU_PACKED NvmeIdNsZoned {
+uint16_tzoc;
+uint16_tozcs;
+uint32_tmar;
+uint32_tmor;
+uint32_trrl;
+uint32_tfrl;
+uint8_t rsvd20[2796];
+NvmeLBAFE   lbafe[16];
+uint8_t rsvd3072[768];
+uint8_t vs[256];
+} NvmeIdNsZoned;
+
 /*Deallocate Logical Block Features*/
 #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)   ((dlfeat) & 0x10)
 #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat)((dlfeat) & 0x08)
@@ -1111,10 +1152,76 @@ enum NvmeIdNsDps {
 DPS_FIRST_EIGHT = 8,
 };
 
+enum NvmeZoneAttr {
+NVME_ZA_FINISHED_BY_CTLR = 1 << 0,
+NVME_ZA_FINISH_RECOMMENDED   = 1 << 1,
+NVME_ZA_RESET_RECOMMENDED= 1 << 2,
+NVME_ZA_ZD_EXT_VALID = 1 << 7,
+};
+
+typedef struct QEMU_PACKED NvmeZoneReportHeader {
+uint64_tnr_zones;
+uint8_t rsvd[56];
+} NvmeZoneReportHeader;
+
+enum NvmeZoneReceiveAction {
+NVME_ZONE_REPORT = 0,
+NVME_ZONE_REPORT_EXTENDED= 1,
+};
+
+enum NvmeZoneReportType {
+NVME_ZONE_REPORT_ALL = 0,
+NVME_ZONE_REPORT_EMPTY   = 1,
+NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2,
+NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3,
+NVME_ZONE_REPORT_CLOSED  = 4,
+NVME_ZONE_REPORT_FULL= 5,
+NVME_ZONE_REPORT_READ_ONLY   = 6,
+NVME_ZONE_REPORT_OFFLINE = 7,
+};
+
+enum NvmeZoneType {
+NVME_ZONE_TYPE_RESERVED  = 0x00,
+NVME_ZONE_TYPE_SEQ_WRITE = 0x02,
+};
+
+enum NvmeZoneSendAction {
+NVME_ZONE_ACTION_RSD = 0x00,
+NVME_ZONE_ACTION_CLOSE   = 0x01,
+NVME_ZONE_ACTION_FINISH  = 0x02,
+NVME_ZONE_ACTION_OPEN= 0x03,
+NVME_ZONE_ACTION_RESET   = 0x04,
+NVME_ZONE_ACTION_OFFLINE = 0x05,
+NVME_ZONE_ACTION_SET_ZD_EXT  = 0x10,
+};
+
+typedef struct QEMU_PACKED NvmeZoneDescr {
+uint8_t zt;
+uint8_t zs;
+uint8_t za;
+uint8_t rsvd3[5];
+uint64_tzcap;
+

[PATCH v11 05/13] hw/block/nvme: Add Commands Supported and Effects log

2020-12-08 Thread Dmitry Fomichev

Implementing this log page becomes necessary to allow checking for
Zone Append command support in the Zoned Namespace Command Set.

This commit adds the code to report this log page for NVM Command
Set only. The parts that are specific to zoned operation will be
added later in the series.

All incoming admin and i/o commands are now only processed if their
corresponding support bits are set in this log. This provides an
easy way to control which commands are supported and which are not,
depending on the value set in CC.CSS.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h|  1 +
 include/block/nvme.h  | 19 +
 hw/block/nvme.c   | 96 +++
 hw/block/trace-events |  1 +
 4 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index aeca810fc7..bdeaf1c0de 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -30,6 +30,7 @@ typedef struct NvmeNamespace {
 int32_t  bootindex;
 int64_t  size;
 NvmeIdNs id_ns;
+const uint32_t *iocs;
 
 NvmeNamespaceParams params;
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index e95ff6ca9b..422c98a297 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -746,10 +746,27 @@ enum NvmeSmartWarn {
 NVME_SMART_FAILED_VOLATILE_MEDIA  = 1 << 4,
 };
 
+typedef struct NvmeEffectsLog {
+uint32_tacs[256];
+uint32_tiocs[256];
+uint8_t resv[2048];
+} NvmeEffectsLog;
+
+enum {
+NVME_CMD_EFF_CSUPP  = 1 << 0,
+NVME_CMD_EFF_LBCC   = 1 << 1,
+NVME_CMD_EFF_NCC= 1 << 2,
+NVME_CMD_EFF_NIC= 1 << 3,
+NVME_CMD_EFF_CCC= 1 << 4,
+NVME_CMD_EFF_CSE_MASK   = 3 << 16,
+NVME_CMD_EFF_UUID_SEL   = 1 << 19,
+};
+
 enum NvmeLogIdentifier {
 NVME_LOG_ERROR_INFO = 0x01,
 NVME_LOG_SMART_INFO = 0x02,
 NVME_LOG_FW_SLOT_INFO   = 0x03,
+NVME_LOG_CMD_EFFECTS= 0x05,
 };
 
 typedef struct QEMU_PACKED NvmePSD {
@@ -862,6 +879,7 @@ enum NvmeIdCtrlFrmw {
 
 enum NvmeIdCtrlLpa {
 NVME_LPA_NS_SMART = 1 << 0,
+NVME_LPA_CSE  = 1 << 1,
 NVME_LPA_EXTENDED = 1 << 2,
 };
 
@@ -1070,6 +1088,7 @@ static inline void _nvme_check_size(void)
 QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
 QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
+QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16);
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 986917dabf..0b047f2069 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -112,6 +112,28 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
 };
 
+static const uint32_t nvme_cse_acs[256] = {
+[NVME_ADM_CMD_DELETE_SQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_CREATE_SQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_DELETE_CQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_CREATE_CQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_ABORT]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
+};
+
+static const uint32_t nvme_cse_iocs_none[256];
+
+static const uint32_t nvme_cse_iocs_nvm[256] = {
+[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
+};
+
 static void nvme_process_sq(void *opaque);
 
 static uint16_t nvme_cid(NvmeRequest *req)
@@ -1203,10 +1225,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
   req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
 
-if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) {
-return NVME_INVALID_OPCODE | NVME_DNR;
-}
-
 if (!nvme_nsid_valid(n, nsid)) {
 return NVME_INVALID_NSID | NVME_DNR;
 }
@@ -1216,6 +1234,11 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
+if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
+trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
+return NVME_INVALID_OPCODE | NVME_DNR;
+}
+
 switch (req->cmd.opcode) {
 case NVME_CMD_FLUSH:
 return nvme_flush(n, req);
@@ -1228,8 +1251,7 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 case NVME_CMD_DSM:

[PATCH v11 04/13] hw/block/nvme: Combine nvme_write_zeroes() and nvme_write()

2020-12-08 Thread Dmitry Fomichev

Move write processing to nvme_do_write() that now handles both WRITE
and WRITE ZEROES. Both nvme_write() and nvme_write_zeroes() become
inline helper functions.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Acked-by: Klaus Jensen 
---
 hw/block/nvme.c   | 78 ---
 hw/block/trace-events |  1 -
 2 files changed, 36 insertions(+), 43 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 897c2d04e5..986917dabf 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1128,32 +1128,7 @@ invalid:
 return status | NVME_DNR;
 }
 
-static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
-{
-NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
-NvmeNamespace *ns = req->ns;
-uint64_t slba = le64_to_cpu(rw->slba);
-uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
-uint64_t offset = nvme_l2b(ns, slba);
-uint32_t count = nvme_l2b(ns, nlb);
-uint16_t status;
-
-trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb);
-
-status = nvme_check_bounds(n, ns, slba, nlb);
-if (status) {
-trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
-return status;
-}
-
-block_acct_start(blk_get_stats(req->ns->blkconf.blk), >acct, 0,
- BLOCK_ACCT_WRITE);
-req->aiocb = blk_aio_pwrite_zeroes(req->ns->blkconf.blk, offset, count,
-   BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
-return NVME_NO_COMPLETE;
-}
-
-static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool wrz)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
 NvmeNamespace *ns = req->ns;
@@ -1167,10 +1142,12 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
  nvme_nsid(ns), nlb, data_size, slba);
 
-status = nvme_check_mdts(n, data_size);
-if (status) {
-trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
-goto invalid;
+if (!wrz) {
+status = nvme_check_mdts(n, data_size);
+if (status) {
+trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+goto invalid;
+}
 }
 
 status = nvme_check_bounds(n, ns, slba, nlb);
@@ -1179,21 +1156,28 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest 
*req)
 goto invalid;
 }
 
-status = nvme_map_dptr(n, data_size, req);
-if (status) {
-goto invalid;
-}
-
 data_offset = nvme_l2b(ns, slba);
 
-block_acct_start(blk_get_stats(blk), >acct, data_size,
- BLOCK_ACCT_WRITE);
-if (req->qsg.sg) {
-req->aiocb = dma_blk_write(blk, >qsg, data_offset,
-   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+if (!wrz) {
+status = nvme_map_dptr(n, data_size, req);
+if (status) {
+goto invalid;
+}
+
+block_acct_start(blk_get_stats(blk), >acct, data_size,
+ BLOCK_ACCT_WRITE);
+if (req->qsg.sg) {
+req->aiocb = dma_blk_write(blk, >qsg, data_offset,
+   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+} else {
+req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
+ nvme_rw_cb, req);
+}
 } else {
-req->aiocb = blk_aio_pwritev(blk, data_offset, >iov, 0,
- nvme_rw_cb, req);
+block_acct_start(blk_get_stats(blk), >acct, 0, BLOCK_ACCT_WRITE);
+req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
+   BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
+   req);
 }
 return NVME_NO_COMPLETE;
 
@@ -1202,6 +1186,16 @@ invalid:
 return status | NVME_DNR;
 }
 
+static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
+{
+return nvme_do_write(n, req, false);
+}
+
+static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
+{
+return nvme_do_write(n, req, true);
+}
+
 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 6233f801e1..02a7c3044c 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -43,7 +43,6 @@ pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t 
opcode, const char *opna
 pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, 
uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 
0x%"PRIx64""
 pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, 
uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb 
%"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
 pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
-pci_nvme_write_zeroes(uint16_t

[PATCH v11 01/13] hw/block/nvme: Process controller reset and shutdown differently

2020-12-08 Thread Dmitry Fomichev

Controller reset and subsystem shutdown are handled very much the same
in the current code, but some of the steps should be different in these
two cases.

Introduce two new functions, nvme_ctrl_reset() and nvme_ctrl_shutdown(),
to separate some portions of the code from nvme_clear_ctrl(). The steps
that are made different between reset and shutdown are that BAR.CC is not
reset to zero upon the shutdown and namespace data is flushed to
backing storage as a part of shutdown handling, but not upon reset.

Suggested-by: Klaus Jensen 
Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme-ns.h |  2 +-
 hw/block/nvme-ns.c |  2 +-
 hw/block/nvme.c| 24 ++--
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 44bf6271b7..ed3d7e65d5 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -73,6 +73,6 @@ typedef struct NvmeCtrl NvmeCtrl;
 
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
-void nvme_ns_flush(NvmeNamespace *ns);
+void nvme_ns_shutdown(NvmeNamespace *ns);
 
 #endif /* NVME_NS_H */
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 847069a66e..9b95e2ed33 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -130,7 +130,7 @@ void nvme_ns_drain(NvmeNamespace *ns)
 blk_drain(ns->blkconf.blk);
 }
 
-void nvme_ns_flush(NvmeNamespace *ns)
+void nvme_ns_shutdown(NvmeNamespace *ns)
 {
 blk_flush(ns->blkconf.blk);
 }
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 59990e00bc..10acb7e7f0 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2197,6 +2197,20 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
 n->aer_queued = 0;
 n->outstanding_aers = 0;
 n->qs_created = false;
+}
+
+static void nvme_ctrl_reset(NvmeCtrl *n)
+{
+nvme_clear_ctrl(n);
+n->bar.cc = 0;
+}
+
+static void nvme_ctrl_shutdown(NvmeCtrl *n)
+{
+NvmeNamespace *ns;
+int i;
+
+nvme_clear_ctrl(n);
 
 for (i = 1; i <= n->num_namespaces; i++) {
 ns = nvme_ns(n, i);
@@ -2204,10 +2218,8 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
 continue;
 }
 
-nvme_ns_flush(ns);
+nvme_ns_shutdown(ns);
 }
-
-n->bar.cc = 0;
 }
 
 static int nvme_start_ctrl(NvmeCtrl *n)
@@ -2374,12 +2386,12 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
 }
 } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
 trace_pci_nvme_mmio_stopped();
-nvme_clear_ctrl(n);
+nvme_ctrl_reset(n);
 n->bar.csts &= ~NVME_CSTS_READY;
 }
 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
 trace_pci_nvme_mmio_shutdown_set();
-nvme_clear_ctrl(n);
+nvme_ctrl_shutdown(n);
 n->bar.cc = data;
 n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
@@ -2990,7 +3002,7 @@ static void nvme_exit(PCIDevice *pci_dev)
 {
 NvmeCtrl *n = NVME(pci_dev);
 
-nvme_clear_ctrl(n);
+nvme_ctrl_shutdown(n);
 g_free(n->cq);
 g_free(n->sq);
 g_free(n->aer_reqs);
-- 
2.28.0

Re: [PATCH v9 08/12] hw/block/nvme: Support Zoned Namespace Command Set

2020-12-08 Thread Dmitry Fomichev

Hi Klaus,

Thank you for your review! Please see replies below...


On Thu, 2020-11-12 at 20:36 +0100, Klaus Jensen wrote:
> Hi Dmitry,
> 
> I know you posted v10, but my comments should be relevant to that as
> well.
> 
> On Nov  5 11:53, Dmitry Fomichev wrote:
> > The emulation code has been changed to advertise NVM Command Set when
> > "zoned" device property is not set (default) and Zoned Namespace
> > Command Set otherwise.
> > 
> > Define values and structures that are needed to support Zoned
> > Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.
> > Define trace events where needed in newly introduced code.
> > 
> > In order to improve scalability, all open, closed and full zones
> > are organized in separate linked lists. Consequently, almost all
> > zone operations don't require scanning of the entire zone array
> > (which potentially can be quite large) - it is only necessary to
> > enumerate one or more zone lists.
> > 
> > Handlers for three new NVMe commands introduced in Zoned Namespace
> > Command Set specification are added, namely for Zone Management
> > Receive, Zone Management Send and Zone Append.
> > 
> > Device initialization code has been extended to create a proper
> > configuration for zoned operation using device properties.
> > 
> > Read/Write command handler is modified to only allow writes at the
> > write pointer if the namespace is zoned. For Zone Append command,
> > writes implicitly happen at the write pointer and the starting write
> > pointer value is returned as the result of the command. Write Zeroes
> > handler is modified to add zoned checks that are identical to those
> > done as a part of Write flow.
> > 
> > Subsequent commits in this series add ZDE support and checks for
> > active and open zone limits.
> > 
> > Signed-off-by: Niklas Cassel 
> > Signed-off-by: Hans Holmberg 
> > Signed-off-by: Ajay Joshi 
> > Signed-off-by: Chaitanya Kulkarni 
> > Signed-off-by: Matias Bjorling 
> > Signed-off-by: Aravind Ramesh 
> > Signed-off-by: Shin'ichiro Kawasaki 
> > Signed-off-by: Adam Manzanares 
> > Signed-off-by: Dmitry Fomichev 
> > Reviewed-by: Niklas Cassel 
> > ---
> >  hw/block/nvme-ns.h|  54 +++
> >  hw/block/nvme.h   |   8 +
> >  hw/block/nvme-ns.c| 173 
> >  hw/block/nvme.c   | 971 +-
> >  hw/block/trace-events |  18 +-
> >  5 files changed, 1209 insertions(+), 15 deletions(-)
> > 
> > diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> > index 2d9cd29d07..d2631ff5a3 100644
> > --- a/hw/block/nvme-ns.h
> > +++ b/hw/block/nvme-ns.h
> > @@ -19,9 +19,20 @@
> >  #define NVME_NS(obj) \
> >  OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
> >  
> > +typedef struct NvmeZone {
> > +NvmeZoneDescr   d;
> > +uint64_tw_ptr;
> > +QTAILQ_ENTRY(NvmeZone) entry;
> > +} NvmeZone;
> > +
> >  typedef struct NvmeNamespaceParams {
> >  uint32_t nsid;
> >  QemuUUID uuid;
> > +
> > +bool zoned;
> > +bool cross_zone_read;
> > +uint64_t zone_size_bs;
> > +uint64_t zone_cap_bs;
> >  } NvmeNamespaceParams;
> >  
> >  typedef struct NvmeNamespace {
> > @@ -34,6 +45,18 @@ typedef struct NvmeNamespace {
> >  bool attached;
> >  uint8_t  csi;
> >  
> > +NvmeIdNsZoned   *id_ns_zoned;
> > +NvmeZone*zone_array;
> > +QTAILQ_HEAD(, NvmeZone) exp_open_zones;
> > +QTAILQ_HEAD(, NvmeZone) imp_open_zones;
> > +QTAILQ_HEAD(, NvmeZone) closed_zones;
> > +QTAILQ_HEAD(, NvmeZone) full_zones;
> > +uint32_tnum_zones;
> > +uint64_tzone_size;
> > +uint64_tzone_capacity;
> > +uint64_tzone_array_size;
> > +uint32_tzone_size_log2;
> > +
> >  NvmeNamespaceParams params;
> >  } NvmeNamespace;
> >  
> > @@ -71,8 +94,39 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, 
> > uint64_t lba)
> >  
> >  typedef struct NvmeCtrl NvmeCtrl;
> >  
> > +static inline uint8_t nvme_get_zone_state(NvmeZone *zone)
> 
> This can (should?) return the NvmeZoneState enum.

Ok, good idea.

> 
> > +{
> > +return zone->d.zs >> 4;
> > +}
> > +
> > +static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState 
> > state)
> > +{
> > +zone->d.zs = state << 4;
> > +}
> > +
> > +static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone 
> > *zone)
> > +{
> > +return zone->d.zslba + ns->zone_size;
> > +}
> > +
> > +static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
> > +{
> > +return zone->d.zslba + zone->d.zcap;
> > +}
> > +
> > +static inline bool nvme_wp_is_valid(NvmeZone *zone)
> > +{
> > +uint8_t st = nvme_get_zone_state(zone);
> > +
> > +return st != NVME_ZONE_STATE_FULL &&
> > +   st != NVME_ZONE_STATE_READ_ONLY &&
> > +   st != NVME_ZONE_STATE_OFFLINE;
> > +}
> > +
> >  int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
> >  void nvme_ns_drain(NvmeNamespace *ns);
> >  void

[PATCH v11 09/13] hw/block/nvme: Support Zoned Namespace Command Set

2020-12-08 Thread Dmitry Fomichev

The emulation code has been changed to advertise NVM Command Set when
"zoned" device property is not set (default) and Zoned Namespace
Command Set otherwise.

Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.
Define trace events where needed in newly introduced code.

In order to improve scalability, all open, closed and full zones
are organized in separate linked lists. Consequently, almost all
zone operations don't require scanning of the entire zone array
(which potentially can be quite large) - it is only necessary to
enumerate one or more zone lists.

Handlers for three new NVMe commands introduced in Zoned Namespace
Command Set specification are added, namely for Zone Management
Receive, Zone Management Send and Zone Append.

Device initialization code has been extended to create a proper
configuration for zoned operation using device properties.

Read/Write command handler is modified to only allow writes at the
write pointer if the namespace is zoned. For Zone Append command,
writes implicitly happen at the write pointer and the starting write
pointer value is returned as the result of the command. Write Zeroes
handler is modified to add zoned checks that are identical to those
done as a part of Write flow.

Subsequent commits in this series add ZDE support and checks for
active and open zone limits.

Signed-off-by: Niklas Cassel 
Signed-off-by: Hans Holmberg 
Signed-off-by: Ajay Joshi 
Signed-off-by: Chaitanya Kulkarni 
Signed-off-by: Matias Bjorling 
Signed-off-by: Aravind Ramesh 
Signed-off-by: Shin'ichiro Kawasaki 
Signed-off-by: Adam Manzanares 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h|  52 +++
 hw/block/nvme.h   |   6 +
 hw/block/nvme-ns.c| 165 +
 hw/block/nvme.c   | 804 +-
 hw/block/trace-events |  17 +
 5 files changed, 1036 insertions(+), 8 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index bdbc98c2ec..388381dda0 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -19,9 +19,20 @@
 #define NVME_NS(obj) \
 OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
 
+typedef struct NvmeZone {
+NvmeZoneDescr   d;
+uint64_tw_ptr;
+QTAILQ_ENTRY(NvmeZone) entry;
+} NvmeZone;
+
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
 QemuUUID uuid;
+
+bool zoned;
+bool cross_zone_read;
+uint64_t zone_size_bs;
+uint64_t zone_cap_bs;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -33,6 +44,17 @@ typedef struct NvmeNamespace {
 const uint32_t *iocs;
 uint8_t  csi;
 
+NvmeIdNsZoned   *id_ns_zoned;
+NvmeZone*zone_array;
+QTAILQ_HEAD(, NvmeZone) exp_open_zones;
+QTAILQ_HEAD(, NvmeZone) imp_open_zones;
+QTAILQ_HEAD(, NvmeZone) closed_zones;
+QTAILQ_HEAD(, NvmeZone) full_zones;
+uint32_tnum_zones;
+uint64_tzone_size;
+uint64_tzone_capacity;
+uint32_tzone_size_log2;
+
 NvmeNamespaceParams params;
 
 struct {
@@ -74,8 +96,38 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t 
lba)
 
 typedef struct NvmeCtrl NvmeCtrl;
 
+static inline enum NvmeZoneState nvme_get_zone_state(NvmeZone *zone)
+{
+return zone->d.zs >> 4;
+}
+
+static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState 
state)
+{
+zone->d.zs = state << 4;
+}
+
+static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone)
+{
+return zone->d.zslba + ns->zone_size;
+}
+
+static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
+{
+return zone->d.zslba + zone->d.zcap;
+}
+
+static inline bool nvme_wp_is_valid(NvmeZone *zone)
+{
+uint8_t st = nvme_get_zone_state(zone);
+
+return st != NVME_ZONE_STATE_FULL &&
+   st != NVME_ZONE_STATE_READ_ONLY &&
+   st != NVME_ZONE_STATE_OFFLINE;
+}
+
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
+void nvme_ns_cleanup(NvmeNamespace *ns);
 
 #endif /* NVME_NS_H */
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 574333caa3..b7fbcca39d 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -6,6 +6,9 @@
 
 #define NVME_MAX_NAMESPACES 256
 
+#define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
+#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+
 typedef struct NvmeParams {
 char *serial;
 uint32_t num_queues; /* deprecated since 5.1 */
@@ -16,6 +19,7 @@ typedef struct NvmeParams {
 uint32_t aer_max_queued;
 uint8_t  mdts;
 bool use_intel_id;
+uint32_t zasl_bs;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
@@ -149,6 +153,8 @@ typedef struct NvmeCtrl {
 QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue;
 int aer_queued;
 
+uint8_t zasl;
+
 NvmeNamespace   namespace;
 NvmeNamespace   *namespaces[NVME_MAX_NAMESPACES];

[PATCH v11 07/13] hw/block/nvme: Support allocated CNS command variants

2020-12-08 Thread Dmitry Fomichev

From: Niklas Cassel 

Many CNS commands have "allocated" command variants. These include
a namespace as long as it is allocated; that is, a namespace is
included regardless of whether it is active (attached) or not.

While these commands are optional (they are mandatory for controllers
supporting the namespace attachment command), our QEMU implementation
is more complete by actually providing support for these CNS values.

However, since our QEMU model currently does not support the namespace
attachment command, these new allocated CNS commands will return the
same result as the active CNS command variants.

The reason for not hooking up this command completely is because the
NVMe specification requires the namespace management command to be
supported if the namespace attachment command is supported.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
---
 include/block/nvme.h | 20 
 hw/block/nvme.c  |  8 
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 890977db4b..29d826ab19 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -807,14 +807,18 @@ typedef struct QEMU_PACKED NvmePSD {
 #define NVME_IDENTIFY_DATA_SIZE 4096
 
 enum NvmeIdCns {
-NVME_ID_CNS_NS= 0x00,
-NVME_ID_CNS_CTRL  = 0x01,
-NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
-NVME_ID_CNS_NS_DESCR_LIST = 0x03,
-NVME_ID_CNS_CS_NS = 0x05,
-NVME_ID_CNS_CS_CTRL   = 0x06,
-NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
-NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
+NVME_ID_CNS_NS= 0x00,
+NVME_ID_CNS_CTRL  = 0x01,
+NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
+NVME_ID_CNS_NS_DESCR_LIST = 0x03,
+NVME_ID_CNS_CS_NS = 0x05,
+NVME_ID_CNS_CS_CTRL   = 0x06,
+NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
+NVME_ID_CNS_NS_PRESENT_LIST   = 0x10,
+NVME_ID_CNS_NS_PRESENT= 0x11,
+NVME_ID_CNS_CS_NS_PRESENT_LIST= 0x1a,
+NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
+NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
 };
 
 typedef struct QEMU_PACKED NvmeIdCtrl {
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 16eed37533..7035896649 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1901,16 +1901,24 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 
 switch (le32_to_cpu(c->cns)) {
 case NVME_ID_CNS_NS:
+ /* fall through */
+case NVME_ID_CNS_NS_PRESENT:
 return nvme_identify_ns(n, req);
 case NVME_ID_CNS_CS_NS:
+ /* fall through */
+case NVME_ID_CNS_CS_NS_PRESENT:
 return nvme_identify_ns_csi(n, req);
 case NVME_ID_CNS_CTRL:
 return nvme_identify_ctrl(n, req);
 case NVME_ID_CNS_CS_CTRL:
 return nvme_identify_ctrl_csi(n, req);
 case NVME_ID_CNS_NS_ACTIVE_LIST:
+ /* fall through */
+case NVME_ID_CNS_NS_PRESENT_LIST:
 return nvme_identify_nslist(n, req);
 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
+ /* fall through */
+case NVME_ID_CNS_CS_NS_PRESENT_LIST:
 return nvme_identify_nslist_csi(n, req);
 case NVME_ID_CNS_NS_DESCR_LIST:
 return nvme_identify_ns_descr_list(n, req);
-- 
2.28.0

[PATCH v11 03/13] hw/block/nvme: Separate read and write handlers

2020-12-08 Thread Dmitry Fomichev

The majority of code in nvme_rw() is becoming read- or write-specific.
Move these parts to two separate handlers, nvme_read() and nvme_write()
to make the code more readable and to remove multiple is_write checks
that have been present in the i/o path.

This is a refactoring patch, no change in functionality.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Acked-by: Klaus Jensen 
---
 hw/block/nvme.c   | 107 --
 hw/block/trace-events |   3 +-
 2 files changed, 74 insertions(+), 36 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a30fe75620..897c2d04e5 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1073,6 +1073,61 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
+static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+NvmeNamespace *ns = req->ns;
+uint64_t slba = le64_to_cpu(rw->slba);
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
+uint64_t data_size = nvme_l2b(ns, nlb);
+uint64_t data_offset;
+BlockBackend *blk = ns->blkconf.blk;
+uint16_t status;
+
+trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, data_size, slba);
+
+status = nvme_check_mdts(n, data_size);
+if (status) {
+trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
+goto invalid;
+}
+
+status = nvme_check_bounds(n, ns, slba, nlb);
+if (status) {
+trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
+goto invalid;
+}
+
+status = nvme_map_dptr(n, data_size, req);
+if (status) {
+goto invalid;
+}
+
+if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
+status = nvme_check_dulbe(ns, slba, nlb);
+if (status) {
+goto invalid;
+}
+}
+
+data_offset = nvme_l2b(ns, slba);
+
+block_acct_start(blk_get_stats(blk), &req->acct, data_size,
+ BLOCK_ACCT_READ);
+if (req->qsg.sg) {
+req->aiocb = dma_blk_read(blk, &req->qsg, data_offset,
+  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+} else {
+req->aiocb = blk_aio_preadv(blk, data_offset, &req->iov, 0,
+nvme_rw_cb, req);
+}
+return NVME_NO_COMPLETE;
+
+invalid:
+block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
+return status | NVME_DNR;
+}
+
 static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
@@ -1098,22 +1153,19 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
-static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
 NvmeNamespace *ns = req->ns;
-uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t slba = le64_to_cpu(rw->slba);
-
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t data_size = nvme_l2b(ns, nlb);
-uint64_t data_offset = nvme_l2b(ns, slba);
-enum BlockAcctType acct = req->cmd.opcode == NVME_CMD_WRITE ?
-BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
+uint64_t data_offset;
 BlockBackend *blk = ns->blkconf.blk;
 uint16_t status;
 
-trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode),
-  nvme_nsid(ns), nlb, data_size, slba);
+trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
+ nvme_nsid(ns), nlb, data_size, slba);
 
 status = nvme_check_mdts(n, data_size);
 if (status) {
@@ -1127,43 +1179,27 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 goto invalid;
 }
 
-if (acct == BLOCK_ACCT_READ) {
-if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
-status = nvme_check_dulbe(ns, slba, nlb);
-if (status) {
-goto invalid;
-}
-}
-}
-
 status = nvme_map_dptr(n, data_size, req);
 if (status) {
 goto invalid;
 }
 
-block_acct_start(blk_get_stats(blk), &req->acct, data_size, acct);
+data_offset = nvme_l2b(ns, slba);
+
+block_acct_start(blk_get_stats(blk), &req->acct, data_size,
+ BLOCK_ACCT_WRITE);
 if (req->qsg.sg) {
-if (acct == BLOCK_ACCT_WRITE) {
-req->aiocb = dma_blk_write(blk, &req->qsg, data_offset,
-   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-} else {
-req->aiocb = dma_blk_read(blk, &req->qsg, data_offset,
-  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-}
+req->aiocb = dma_blk_write(blk, &req->qsg, data_offset,
+   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
 } else {
-if (acct == BLOCK_ACCT_WRITE) {
-req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0,
- nvme_rw_cb, req);
-

[PATCH v11 02/13] hw/block/nvme: Generate namespace UUIDs

2020-12-08 Thread Dmitry Fomichev

In NVMe 1.4, a namespace must report an ID descriptor of UUID type
if it doesn't support EUI64 or NGUID. Add a new namespace property,
"uuid", that provides the user the option to either specify the UUID
explicitly or have a UUID generated automatically every time a
namespace is initialized.

Suggested-by: Klaus Jensen 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Reviewed-by: Niklas Cassel 
---
 hw/block/nvme-ns.h | 1 +
 hw/block/nvme-ns.c | 1 +
 hw/block/nvme.c| 9 +
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index ed3d7e65d5..aeca810fc7 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -21,6 +21,7 @@
 
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
+QemuUUID uuid;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 9b95e2ed33..6349aa30be 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -152,6 +152,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
 static Property nvme_ns_props[] = {
 DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
 DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
+DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 10acb7e7f0..a30fe75620 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1662,6 +1662,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 
 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
 {
+NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
 uint32_t nsid = le32_to_cpu(c->nsid);
 uint8_t list[NVME_IDENTIFY_DATA_SIZE];
@@ -1681,7 +1682,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_NSID | NVME_DNR;
 }
 
-if (unlikely(!nvme_ns(n, nsid))) {
+ns = nvme_ns(n, nsid);
+if (unlikely(!ns)) {
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
@@ -1690,12 +1692,11 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl 
*n, NvmeRequest *req)
 /*
  * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
  * structure, a Namespace UUID (nidt = 0x3) must be reported in the
- * Namespace Identification Descriptor. Add a very basic Namespace UUID
- * here.
+ * Namespace Identification Descriptor. Add the namespace UUID here.
  */
 ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
 ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN;
-stl_be_p(&ns_descrs->uuid.v, nsid);
+memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDT_UUID_LEN);
 
 return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE,
 DMA_DIRECTION_FROM_DEVICE, req);
-- 
2.28.0

[PULL 48/66] libvhost-user: make it a meson subproject

2020-12-08 Thread Michael S. Tsirkin

From: Marc-André Lureau 

By making libvhost-user a subproject, check it builds
standalone (without the global QEMU cflags etc).

Note that the library still relies on QEMU include/qemu/atomic.h and
linux_headers/.

Signed-off-by: Marc-André Lureau 
Message-Id: <20201125100640.366523-6-marcandre.lur...@redhat.com>
Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Michael S. Tsirkin 
---
 contrib/vhost-user-gpu/vugpu.h|  2 +-
 include/qemu/vhost-user-server.h  |  2 +-
 .../libvhost-user/libvhost-user-glib.h|  0
 .../libvhost-user/libvhost-user.h |  0
 block/export/vhost-user-blk-server.c  |  2 +-
 contrib/vhost-user-blk/vhost-user-blk.c   |  3 +--
 contrib/vhost-user-input/main.c   |  3 +--
 contrib/vhost-user-scsi/vhost-user-scsi.c |  2 +-
 .../libvhost-user/libvhost-user-glib.c|  0
 .../libvhost-user/libvhost-user.c |  0
 tests/vhost-user-bridge.c |  2 +-
 tools/virtiofsd/fuse_virtio.c |  2 +-
 contrib/libvhost-user/meson.build |  4 
 contrib/vhost-user-blk/meson.build|  3 +--
 contrib/vhost-user-gpu/meson.build|  3 +--
 contrib/vhost-user-input/meson.build  |  3 +--
 contrib/vhost-user-scsi/meson.build   |  3 +--
 meson.build   |  7 ++-
 subprojects/libvhost-user/meson.build | 20 +++
 tests/meson.build |  3 +--
 tools/virtiofsd/meson.build   |  3 +--
 21 files changed, 40 insertions(+), 27 deletions(-)
 rename {contrib => subprojects}/libvhost-user/libvhost-user-glib.h (100%)
 rename {contrib => subprojects}/libvhost-user/libvhost-user.h (100%)
 rename {contrib => subprojects}/libvhost-user/libvhost-user-glib.c (100%)
 rename {contrib => subprojects}/libvhost-user/libvhost-user.c (100%)
 delete mode 100644 contrib/libvhost-user/meson.build
 create mode 100644 subprojects/libvhost-user/meson.build

diff --git a/contrib/vhost-user-gpu/vugpu.h b/contrib/vhost-user-gpu/vugpu.h
index 3153c9a6de..bdf9a74b46 100644
--- a/contrib/vhost-user-gpu/vugpu.h
+++ b/contrib/vhost-user-gpu/vugpu.h
@@ -17,7 +17,7 @@
 
 #include "qemu/osdep.h"
 
-#include "contrib/libvhost-user/libvhost-user-glib.h"
+#include "libvhost-user-glib.h"
 #include "standard-headers/linux/virtio_gpu.h"
 
 #include "qemu/queue.h"
diff --git a/include/qemu/vhost-user-server.h b/include/qemu/vhost-user-server.h
index 0da4c2cc4c..121ea1dedf 100644
--- a/include/qemu/vhost-user-server.h
+++ b/include/qemu/vhost-user-server.h
@@ -11,7 +11,7 @@
 #ifndef VHOST_USER_SERVER_H
 #define VHOST_USER_SERVER_H
 
-#include "contrib/libvhost-user/libvhost-user.h"
+#include "subprojects/libvhost-user/libvhost-user.h" /* only for the type 
definitions */
 #include "io/channel-socket.h"
 #include "io/channel-file.h"
 #include "io/net-listener.h"
diff --git a/contrib/libvhost-user/libvhost-user-glib.h 
b/subprojects/libvhost-user/libvhost-user-glib.h
similarity index 100%
rename from contrib/libvhost-user/libvhost-user-glib.h
rename to subprojects/libvhost-user/libvhost-user-glib.h
diff --git a/contrib/libvhost-user/libvhost-user.h 
b/subprojects/libvhost-user/libvhost-user.h
similarity index 100%
rename from contrib/libvhost-user/libvhost-user.h
rename to subprojects/libvhost-user/libvhost-user.h
diff --git a/block/export/vhost-user-blk-server.c 
b/block/export/vhost-user-blk-server.c
index 62672d1cb9..a3d95ca012 100644
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -11,7 +11,7 @@
  */
 #include "qemu/osdep.h"
 #include "block/block.h"
-#include "contrib/libvhost-user/libvhost-user.h"
+#include "subprojects/libvhost-user/libvhost-user.h" /* only for the type 
definitions */
 #include "standard-headers/linux/virtio_blk.h"
 #include "qemu/vhost-user-server.h"
 #include "vhost-user-blk-server.h"
diff --git a/contrib/vhost-user-blk/vhost-user-blk.c 
b/contrib/vhost-user-blk/vhost-user-blk.c
index dc981bf945..6abd7835a8 100644
--- a/contrib/vhost-user-blk/vhost-user-blk.c
+++ b/contrib/vhost-user-blk/vhost-user-blk.c
@@ -17,8 +17,7 @@
 
 #include "qemu/osdep.h"
 #include "standard-headers/linux/virtio_blk.h"
-#include "contrib/libvhost-user/libvhost-user-glib.h"
-#include "contrib/libvhost-user/libvhost-user.h"
+#include "libvhost-user-glib.h"
 
 #if defined(__linux__)
 #include 
diff --git a/contrib/vhost-user-input/main.c b/contrib/vhost-user-input/main.c
index 6020c6f33a..3ea840cf44 100644
--- a/contrib/vhost-user-input/main.c
+++ b/contrib/vhost-user-input/main.c
@@ -12,8 +12,7 @@
 #include "qemu/iov.h"
 #include "qemu/bswap.h"
 #include "qemu/sockets.h"
-#include "contrib/libvhost-user/libvhost-user.h"
-#include "contrib/libvhost-user/libvhost-user-glib.h"
+#include "libvhost-user-glib.h"
 #include "standard-headers/linux/virtio_input.h"
 #include "qapi/error.h"
 
diff --git a/contrib/vhost-user-scsi/vhost-user-scsi.c

[PULL 55/66] block/export: avoid g_return_val_if() input validation

2020-12-08 Thread Michael S. Tsirkin

From: Stefan Hajnoczi 

Do not validate input with g_return_val_if_fail(). This API is intended for
checking programming errors and is compiled out with -DG_DISABLE_CHECKS.

Use an explicit if statement for input validation so it cannot
accidentally be compiled out.

Suggested-by: Markus Armbruster 
Signed-off-by: Stefan Hajnoczi 
Message-Id: <20201118091644.199527-5-stefa...@redhat.com>
Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Michael S. Tsirkin 
---
 block/export/vhost-user-blk-server.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/export/vhost-user-blk-server.c 
b/block/export/vhost-user-blk-server.c
index a3d95ca012..ab2c4d44c4 100644
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -267,7 +267,9 @@ vu_blk_get_config(VuDev *vu_dev, uint8_t *config, uint32_t 
len)
 VuServer *server = container_of(vu_dev, VuServer, vu_dev);
 VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
 
-g_return_val_if_fail(len <= sizeof(struct virtio_blk_config), -1);
+if (len > sizeof(struct virtio_blk_config)) {
+return -1;
+}
 
 memcpy(config, &vexp->blkcfg, len);
 return 0;
-- 
MST

Re: [PATCH 3/3] block: Fix deadlock in bdrv_co_yield_to_drain()

2020-12-08 Thread Kevin Wolf

Am 08.12.2020 um 16:33 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 03.12.2020 20:23, Kevin Wolf wrote:
> > If bdrv_co_yield_to_drain() is called for draining a block node that
> > runs in a different AioContext, it keeps that AioContext locked while it
> > yields and schedules a BH in the AioContext to do the actual drain.
> > 
> > As long as executing the BH is the very next thing the event loop of the
> 
> s/thing the event/thing in the event/
> 
> (I've reread several times to understand :)

Oops, thanks.

"...the next thing that the event loop _does_" is actually what I had in
mind.

> > node's AioContext, this actually happens to work, but when it tries to
> > execute something else that wants to take the AioContext lock, it will
> > deadlock. (In the bug report, this other thing is a virtio-scsi device
> > running virtio_scsi_data_plane_handle_cmd().)
> > 
> > Instead, always drop the AioContext lock across the yield and reacquire
> > it only when the coroutine is reentered. The BH needs to unconditionally
> > take the lock for itself now.
> > 
> > This fixes the 'block_resize' QMP command on a block node that runs in
> > an iothread.
> > 
> > Cc: qemu-sta...@nongnu.org
> > Fixes: eb94b81a94bce112e6b206df846c1551aaf6cab6
> > Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1903511
> > Signed-off-by: Kevin Wolf 
> 
> I don't feel myself good enough in aio contexts acquiring and
> switching, to see any side effects. At least I don't see any obvious
> mistakes, so my weak:
> 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 

> Note, I looked through the callers:
> 
> bdrv_do_drained_begin/end should be ok, as their normal usage is to
> start/end drained section under acquired aio context, so it seems
> correct to temporary release the context. Still I didn't check all
> drained sections in the code.
> 
> bdrv_drain_all_begin seems OK too (we just wait until everything is
> drained, not bad to temporary release the lock). Still I don't see any
> call of it from coroutine context.

The good thing there is that BDRV_POLL_WHILE() drops the lock anyway, so
at least for all callers of bdrv_do_drained_begin() that pass poll=true,
we know that they are fine with releasing the lock temporarily.

There are two callers that pass false: The recursive call inside the
function itself, and bdrv_drain_all_begin(). We know that both will poll
later, so they always release the lock at least once.

For ending the drain section, there is bdrv_drained_end_no_poll(), which
is only called in bdrv_child_cb_drained_end(), i.e. an implementation of
BdrvChildClass.drained_end. This is only called recursively in the
context of a polling drain_end, which already drops the lock.

So I think we don't introduce any cases of dropping the lock where this
wouldn't have happened before.

Kevin

Re: [PATCH 0/4] block: prepare for 64bit

2020-12-08 Thread Vladimir Sementsov-Ogievskiy


08.12.2020 20:13, Kevin Wolf wrote:

Am 03.12.2020 um 23:27 hat Vladimir Sementsov-Ogievskiy geschrieben:

Hi all!

This is a preparation series for v4 of "[PATCH v3 00/17] 64bit
block-layer".

The whole thing is in 04, and 01-03 are small preparations.


Thanks, applied to the block branch.



Thank you!

--
Best regards,
Vladimir

Re: [PATCH 0/4] block: prepare for 64bit

2020-12-08 Thread Kevin Wolf

Am 03.12.2020 um 23:27 hat Vladimir Sementsov-Ogievskiy geschrieben:
> Hi all!
> 
> This is a preparation series for v4 of "[PATCH v3 00/17] 64bit
> block-layer".
> 
> The whole thing is in 04, and 01-03 are small preparations.

Thanks, applied to the block branch.

Kevin

Re: [PATCH 1/3] block: Simplify qmp_block_resize() error paths

2020-12-08 Thread Kevin Wolf

Am 08.12.2020 um 15:15 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 03.12.2020 20:23, Kevin Wolf wrote:
> > The only thing that happens after the 'out:' label is blk_unref(blk).
> > However, blk = NULL in all of the error cases, so instead of jumping to
> > 'out:', we can just return directly.
> > 
> > Cc: qemu-sta...@nongnu.org
> > Signed-off-by: Kevin Wolf 
> > ---
> >   blockdev.c | 7 +++
> >   1 file changed, 3 insertions(+), 4 deletions(-)
> > 
> > diff --git a/blockdev.c b/blockdev.c
> > index fe6fb5dc1d..229d2cce1b 100644
> > --- a/blockdev.c
> > +++ b/blockdev.c
> > @@ -2468,17 +2468,17 @@ void coroutine_fn qmp_block_resize(bool has_device, 
> > const char *device,
> >   if (size < 0) {
> >   error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "size", "a >0 
> > size");
> > -goto out;
> > +return;
> >   }
> >   if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_RESIZE, NULL)) {
> >   error_setg(errp, QERR_DEVICE_IN_USE, device);
> > -goto out;
> > +return;
> >   }
> >   blk = blk_new_with_bs(bs, BLK_PERM_RESIZE, BLK_PERM_ALL, errp);
> >   if (!blk) {
> > -goto out;
> > +return;
> >   }
> >   bdrv_drained_begin(bs);
> > @@ -2487,7 +2487,6 @@ void coroutine_fn qmp_block_resize(bool has_device, 
> > const char *device,
> >   bdrv_co_leave(bs, old_ctx);
> >   bdrv_drained_end(bs);
> > -out:
> >   bdrv_co_lock(bs);
> >   blk_unref(blk);
> >   bdrv_co_unlock(bs);
> > 
> 
> Initialization of blk to NULL becomes redundant with this patch, so
> may be dropped too.

Good catch, I'll change this while applying.

Kevin

Re: [PATCH v3 2/2] block: qcow2: remove the created file on initialization error

2020-12-08 Thread Maxim Levitsky

On Tue, 2020-12-08 at 19:54 +0300, Vladimir Sementsov-Ogievskiy wrote:
> 08.12.2020 19:27, Maxim Levitsky wrote:
> > On Tue, 2020-12-08 at 18:47 +0300, Vladimir Sementsov-Ogievskiy wrote:
> > > 08.12.2020 17:21, Maxim Levitsky wrote:
> > > > If the qcow initialization fails, we should remove the file if it was
> > > > already created, to avoid leaving stale files around.
> > > > 
> > > > We already do this for luks raw images.
> > > > 
> > > > Signed-off-by: Maxim Levitsky 
> > > > ---
> > > >block/qcow2.c | 13 +
> > > >1 file changed, 13 insertions(+)
> > > > 
> > > > diff --git a/block/qcow2.c b/block/qcow2.c
> > > > index 3a90ef2786..3bc2096b72 100644
> > > > --- a/block/qcow2.c
> > > > +++ b/block/qcow2.c
> > > > @@ -3848,6 +3848,19 @@ static int coroutine_fn 
> > > > qcow2_co_create_opts(BlockDriver *drv,
> > > >/* Create the qcow2 image (format layer) */
> > > >ret = qcow2_co_create(create_options, errp);
> > > >if (ret < 0) {
> > > > +
> > > > +Error *local_delete_err = NULL;
> > > > > +int r_del = bdrv_co_delete_file(bs, &local_delete_err);
> > > > +/*
> > > > + * ENOTSUP will happen if the block driver doesn't support
> > > > + * the 'bdrv_co_delete_file' interface. This is a predictable
> > > > + * scenario and shouldn't be reported back to the user.
> > > > + */
> > > > +if ((r_del < 0) && (r_del != -ENOTSUP)) {
> > > > +error_report_err(local_delete_err);
> > > > +} else {
> > > > +error_free(local_delete_err);
> > > > +}
> > > >goto finish;
> > > >}
> > > >
> > > > 
> > > 
> > > Hi!
> > > 
> > > As I understand, qcow2_co_create is a new interface and 
> > > qcow2_co_create_opts() is old, and now works as a wrapper on 
> > > qcow2_co_create.
> > > 
> > > I think it's better to do the cleanup in qcow2_co_create, to bring the 
> > > feature both to new and old interface in the same way.
> > 
> > I think that the new interface doesn't need this fix, since
> > using the new interface is only possible from qmp which
> > forces the user to explicitly create and open the file
> > prior to formatting it with qcow2 format.
> > 
> 
> Oh yes, you are right. File is created by bdrv_create_file() in 
> qcow2_co_create_opts() not in qcow2_co_create(). Still, I think, you should 
> remove the file on any failure after bdrv_create_file() call, but you remove 
> it only on the last failure point..

You are right! The bulk of the code that can fail is in qcow2_co_create_opts,
but there are indeed a few error conditions prior to that.

Thanks for pointing that out.
I'll fix that.

Best regards,
Maxim Levitsky


> 
>

Re: [PATCH v3 2/2] block: qcow2: remove the created file on initialization error

2020-12-08 Thread Vladimir Sementsov-Ogievskiy


08.12.2020 19:27, Maxim Levitsky wrote:

On Tue, 2020-12-08 at 18:47 +0300, Vladimir Sementsov-Ogievskiy wrote:

08.12.2020 17:21, Maxim Levitsky wrote:

If the qcow initialization fails, we should remove the file if it was
already created, to avoid leaving stale files around.

We already do this for luks raw images.

Signed-off-by: Maxim Levitsky 
---
   block/qcow2.c | 13 +
   1 file changed, 13 insertions(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index 3a90ef2786..3bc2096b72 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3848,6 +3848,19 @@ static int coroutine_fn qcow2_co_create_opts(BlockDriver 
*drv,
   /* Create the qcow2 image (format layer) */
   ret = qcow2_co_create(create_options, errp);
   if (ret < 0) {
+
+Error *local_delete_err = NULL;
+int r_del = bdrv_co_delete_file(bs, &local_delete_err);
+/*
+ * ENOTSUP will happen if the block driver doesn't support
+ * the 'bdrv_co_delete_file' interface. This is a predictable
+ * scenario and shouldn't be reported back to the user.
+ */
+if ((r_del < 0) && (r_del != -ENOTSUP)) {
+error_report_err(local_delete_err);
+} else {
+error_free(local_delete_err);
+}
   goto finish;
   }
   



Hi!

As I understand, qcow2_co_create is a new interface and qcow2_co_create_opts() 
is old, and now works as a wrapper on qcow2_co_create.

I think it's better to do the cleanup in qcow2_co_create, to bring the feature 
both to new and old interface in the same way.


I think that the new interface doesn't need this fix, since
using the new interface is only possible from qmp which
forces the user to explicitly create and open the file
prior to formatting it with qcow2 format.



Oh yes, you are right. File is created by bdrv_create_file() in 
qcow2_co_create_opts() not in qcow2_co_create(). Still, I think, you should 
remove the file on any failure after bdrv_create_file() call, but you remove it 
only on the last failure point..


--
Best regards,
Vladimir

Re: [PATCH 2/3] block: Fix locking in qmp_block_resize()

2020-12-08 Thread Kevin Wolf

Am 08.12.2020 um 15:46 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 03.12.2020 20:23, Kevin Wolf wrote:
> > The drain functions assume that we hold the AioContext lock of the
> > drained block node. Make sure to actually take the lock.
> > 
> > Cc: qemu-sta...@nongnu.org
> > Fixes: eb94b81a94bce112e6b206df846c1551aaf6cab6
> > Signed-off-by: Kevin Wolf 
> > ---
> >   blockdev.c | 5 -
> >   1 file changed, 4 insertions(+), 1 deletion(-)
> > 
> > diff --git a/blockdev.c b/blockdev.c
> > index 229d2cce1b..0535a8dc9e 100644
> > --- a/blockdev.c
> > +++ b/blockdev.c
> > @@ -2481,13 +2481,16 @@ void coroutine_fn qmp_block_resize(bool has_device, 
> > const char *device,
> >   return;
> >   }
> > +bdrv_co_lock(bs);
> >   bdrv_drained_begin(bs);
> > +bdrv_co_unlock(bs);
> > +
> >   old_ctx = bdrv_co_enter(bs);
> >   blk_truncate(blk, size, false, PREALLOC_MODE_OFF, 0, errp);
> >   bdrv_co_leave(bs, old_ctx);
> > -bdrv_drained_end(bs);
> >   bdrv_co_lock(bs);
> > +bdrv_drained_end(bs);
> >   blk_unref(blk);
> >   bdrv_co_unlock(bs);
> >   }
> > 
> 
> Can't we just do
> 
> old_ctx = bdrv_co_enter(bs);
> 
> bdrv_drained_begin(bs);
> blk_truncate(blk, size, false, PREALLOC_MODE_OFF, 0, errp);
> bdrv_drained_end(bs);
> blk_unref(blk);
> 
> bdrv_co_leave(bs, old_ctx);
> 
> 
> ? This way we have one acquire/release section instead of three in a
> row.. But then we probably need additional bdrv_ref/bdrv_unref, to not
> crash with final bdrv_co_leave after blk_unref.

That was my first attempt, but bdrv_co_enter()/leave() increase
bs->in_flight, so the drain would deadlock.

> Also, preexisting, but it seems not good that coroutine_fn
> qmp_block_resize is called from non-coroutine hmp_block_resize()

hmp_block_resize() is actually in coroutine context, commit eb94b81a
only forgot to add a coroutine_fn marker to it.

> anyway:
> Reviewed-by: Vladimir Sementsov-Ogievskiy 

Thanks!

Kevin

Re: [PATCH v3 2/2] block: qcow2: remove the created file on initialization error

2020-12-08 Thread Maxim Levitsky

On Tue, 2020-12-08 at 18:47 +0300, Vladimir Sementsov-Ogievskiy wrote:
> 08.12.2020 17:21, Maxim Levitsky wrote:
> > If the qcow initialization fails, we should remove the file if it was
> > already created, to avoid leaving stale files around.
> > 
> > We already do this for luks raw images.
> > 
> > Signed-off-by: Maxim Levitsky 
> > ---
> >   block/qcow2.c | 13 +
> >   1 file changed, 13 insertions(+)
> > 
> > diff --git a/block/qcow2.c b/block/qcow2.c
> > index 3a90ef2786..3bc2096b72 100644
> > --- a/block/qcow2.c
> > +++ b/block/qcow2.c
> > @@ -3848,6 +3848,19 @@ static int coroutine_fn 
> > qcow2_co_create_opts(BlockDriver *drv,
> >   /* Create the qcow2 image (format layer) */
> >   ret = qcow2_co_create(create_options, errp);
> >   if (ret < 0) {
> > +
> > +Error *local_delete_err = NULL;
> > > +int r_del = bdrv_co_delete_file(bs, &local_delete_err);
> > +/*
> > + * ENOTSUP will happen if the block driver doesn't support
> > + * the 'bdrv_co_delete_file' interface. This is a predictable
> > + * scenario and shouldn't be reported back to the user.
> > + */
> > +if ((r_del < 0) && (r_del != -ENOTSUP)) {
> > +error_report_err(local_delete_err);
> > +} else {
> > +error_free(local_delete_err);
> > +}
> >   goto finish;
> >   }
> >   
> > 
> 
> Hi!
> 
> As I understand, qcow2_co_create is a new interface and 
> qcow2_co_create_opts() is old, and now works as a wrapper on qcow2_co_create.
> 
> I think it's better to do the cleanup in qcow2_co_create, to bring the 
> feature both to new and old interface in the same way.

I think that the new interface doesn't need this fix, since 
using the new interface is only possible from qmp which 
forces the user to explicitly create and open the file 
prior to formatting it with qcow2 format.

Thus it is logical to make the user remove it as well if creation fails.

Best regards,
Maxim Levitsky

> 
>

Re: [PATCH v3 2/2] block: qcow2: remove the created file on initialization error

2020-12-08 Thread Vladimir Sementsov-Ogievskiy


08.12.2020 17:21, Maxim Levitsky wrote:

If the qcow initialization fails, we should remove the file if it was
already created, to avoid leaving stale files around.

We already do this for luks raw images.

Signed-off-by: Maxim Levitsky 
---
  block/qcow2.c | 13 +
  1 file changed, 13 insertions(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index 3a90ef2786..3bc2096b72 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3848,6 +3848,19 @@ static int coroutine_fn qcow2_co_create_opts(BlockDriver 
*drv,
  /* Create the qcow2 image (format layer) */
  ret = qcow2_co_create(create_options, errp);
  if (ret < 0) {
+
+Error *local_delete_err = NULL;
+int r_del = bdrv_co_delete_file(bs, &local_delete_err);
+/*
+ * ENOTSUP will happen if the block driver doesn't support
+ * the 'bdrv_co_delete_file' interface. This is a predictable
+ * scenario and shouldn't be reported back to the user.
+ */
+if ((r_del < 0) && (r_del != -ENOTSUP)) {
+error_report_err(local_delete_err);
+} else {
+error_free(local_delete_err);
+}
  goto finish;
  }
  



Hi!

As I understand, qcow2_co_create is a new interface and qcow2_co_create_opts() 
is old, and now works as a wrapper on qcow2_co_create.

I think it's better to do the cleanup in qcow2_co_create, to bring the feature 
both to new and old interface in the same way.


--
Best regards,
Vladimir

Re: [PATCH 3/3] block: Fix deadlock in bdrv_co_yield_to_drain()

2020-12-08 Thread Vladimir Sementsov-Ogievskiy


03.12.2020 20:23, Kevin Wolf wrote:

If bdrv_co_yield_to_drain() is called for draining a block node that
runs in a different AioContext, it keeps that AioContext locked while it
yields and schedules a BH in the AioContext to do the actual drain.

As long as executing the BH is the very next thing the event loop of the


s/thing the event/thing in the event/

(I've reread several times to understand :)


node's AioContext, this actually happens to work, but when it tries to
execute something else that wants to take the AioContext lock, it will
deadlock. (In the bug report, this other thing is a virtio-scsi device
running virtio_scsi_data_plane_handle_cmd().)

Instead, always drop the AioContext lock across the yield and reacquire
it only when the coroutine is reentered. The BH needs to unconditionally
take the lock for itself now.

This fixes the 'block_resize' QMP command on a block node that runs in
an iothread.

Cc: qemu-sta...@nongnu.org
Fixes: eb94b81a94bce112e6b206df846c1551aaf6cab6
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1903511
Signed-off-by: Kevin Wolf 


I'm not confident enough about AioContext acquiring and switching to spot all 
possible side effects. At least I don't see any obvious mistakes, so my weak:

Reviewed-by: Vladimir Sementsov-Ogievskiy 

Note, I looked through the callers:

bdrv_do_drained_begin/end should be ok, as their normal usage is to start/end 
drained section under acquired aio context, so it seems correct to temporarily 
release the context. Still I didn't check all drained sections in the code.

bdrv_drain_all_begin seems OK too (we just wait until everything is drained, 
not bad to temporarily release the lock). Still I don't see any call of it from 
coroutine context.

--
Best regards,
Vladimir

Re: [PATCH v3 2/2] block: qcow2: remove the created file on initialization error

2020-12-08 Thread Maxim Levitsky

On Tue, 2020-12-08 at 16:26 +0100, Alberto Garcia wrote:
> On Tue 08 Dec 2020 03:21:59 PM CET, Maxim Levitsky wrote:
> > If the qcow initialization fails, we should remove the file if it was
> > already created, to avoid leaving stale files around.
> > 
> > We already do this for luks raw images.
> > 
> > Signed-off-by: Maxim Levitsky 
> 
> Reviewed-by: Alberto Garcia 
> 
> >  ret = qcow2_co_create(create_options, errp);
> >  if (ret < 0) {
> > +
> > +Error *local_delete_err = NULL;
> 
> Why that empty line though?

I didn't notice. I can send a new version if this is needed.

Thanks for the review!

Best regards,
Maxim Levitsky
> 
> Berto
>

Re: [PATCH v3 1/2] crypto: luks: Fix tiny memory leak

2020-12-08 Thread Alberto Garcia

On Tue 08 Dec 2020 03:21:58 PM CET, Maxim Levitsky wrote:
> When the underlying block device doesn't support the
> bdrv_co_delete_file interface, an 'Error' object was leaked.
>
> Signed-off-by: Maxim Levitsky 

Reviewed-by: Alberto Garcia 

Berto

Re: [PATCH v3 2/2] block: qcow2: remove the created file on initialization error

2020-12-08 Thread Alberto Garcia

On Tue 08 Dec 2020 03:21:59 PM CET, Maxim Levitsky wrote:
> If the qcow initialization fails, we should remove the file if it was
> already created, to avoid leaving stale files around.
>
> We already do this for luks raw images.
>
> Signed-off-by: Maxim Levitsky 

Reviewed-by: Alberto Garcia 

>  ret = qcow2_co_create(create_options, errp);
>  if (ret < 0) {
> +
> +Error *local_delete_err = NULL;

Why that empty line though?

Berto

Re: [PATCH v1 0/2] Add timeout mechanism to qmp actions

2020-12-08 Thread Glauber Costa

On Tue, Dec 8, 2020 at 8:11 AM Stefan Hajnoczi  wrote:
>
> On Thu, Oct 22, 2020 at 05:29:16PM +0100, Fam Zheng wrote:
> > On Tue, 2020-10-20 at 09:34 +0800, Zhenyu Ye wrote:
> > > On 2020/10/19 21:25, Paolo Bonzini wrote:
> > > > On 19/10/20 14:40, Zhenyu Ye wrote:
> > > > > The kernel backtrace for io_submit in GUEST is:
> > > > >
> > > > > guest# ./offcputime -K -p `pgrep -nx fio`
> > > > > b'finish_task_switch'
> > > > > b'__schedule'
> > > > > b'schedule'
> > > > > b'io_schedule'
> > > > > b'blk_mq_get_tag'
> > > > > b'blk_mq_get_request'
> > > > > b'blk_mq_make_request'
> > > > > b'generic_make_request'
> > > > > b'submit_bio'
> > > > > b'blkdev_direct_IO'
> > > > > b'generic_file_read_iter'
> > > > > b'aio_read'
> > > > > b'io_submit_one'
> > > > > b'__x64_sys_io_submit'
> > > > > b'do_syscall_64'
> > > > > b'entry_SYSCALL_64_after_hwframe'
> > > > > -fio (1464)
> > > > > 40031912
> > > > >
> > > > > And Linux io_uring can avoid the latency problem.
> >
> > Thanks for the info. What this tells us is basically the inflight
> > requests are high. It's sad that the linux-aio is in practice
> > implemented as a blocking API.

it is.

> >
> > Host side backtrace will be of more help. Can you get that too?
>
> I guess Linux AIO didn't set the BLK_MQ_REQ_NOWAIT flag so the task went
> to sleep when it ran out of blk-mq tags. The easiest solution is to move
> to io_uring. Linux AIO is broken - it's not AIO :).

Agree!
>
> If we know that no other process is writing to the host block device
> then maybe we can determine the blk-mq tags limit (the queue depth) and
> avoid sending more requests. That way QEMU doesn't block, but I don't
> think this approach works when other processes are submitting I/O to the
> same host block device :(.
>
> Fam's original suggestion of invoking io_submit(2) from a worker thread
> is an option, but I'm afraid it will slow down the uncontended case.
>
> I'm CCing Glauber in case he battled this in the past in ScyllaDB.

We have, and a lot. I don't recall seeing this particular lock, but
XFS would block us all the time
if it had to update metadata to submit the operation, lock inodes, etc.

The work we did at the time was in fixing those things in the kernel
as much as we could.
But the API is just like that...

>
> Stefan

[PATCH] block/nvme: Fix possible array index out of bounds in nvme_process_completion()

2020-12-08 Thread Alex Chen

The range of 'cid' is [1, NVME_QUEUE_SIZE-1], so when 'cid' is equal to
NVME_QUEUE_SIZE, the entry should be skipped (continue); otherwise it will
lead to an array index out of bounds when accessing 'q->reqs[cid-1]'.

Reported-by: Euler Robot 
Signed-off-by: Alex Chen 
---
 block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index a06a188d53..3a2b3f5486 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -402,7 +402,7 @@ static bool nvme_process_completion(NVMeQueuePair *q)
 q->cq_phase = !q->cq_phase;
 }
 cid = le16_to_cpu(c->cid);
-if (cid == 0 || cid > NVME_QUEUE_SIZE) {
+if (cid == 0 || cid >= NVME_QUEUE_SIZE) {
 warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
 "queue size: %u", cid, NVME_QUEUE_SIZE);
 continue;
-- 
2.19.1

Re: [PATCH 2/3] block: Fix locking in qmp_block_resize()

2020-12-08 Thread Vladimir Sementsov-Ogievskiy


03.12.2020 20:23, Kevin Wolf wrote:

The drain functions assume that we hold the AioContext lock of the
drained block node. Make sure to actually take the lock.

Cc: qemu-sta...@nongnu.org
Fixes: eb94b81a94bce112e6b206df846c1551aaf6cab6
Signed-off-by: Kevin Wolf 
---
  blockdev.c | 5 -
  1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/blockdev.c b/blockdev.c
index 229d2cce1b..0535a8dc9e 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2481,13 +2481,16 @@ void coroutine_fn qmp_block_resize(bool has_device, 
const char *device,
  return;
  }
  
+bdrv_co_lock(bs);

  bdrv_drained_begin(bs);
+bdrv_co_unlock(bs);
+
  old_ctx = bdrv_co_enter(bs);
  blk_truncate(blk, size, false, PREALLOC_MODE_OFF, 0, errp);
  bdrv_co_leave(bs, old_ctx);
-bdrv_drained_end(bs);
  
  bdrv_co_lock(bs);

+bdrv_drained_end(bs);
  blk_unref(blk);
  bdrv_co_unlock(bs);
  }



Can't we just do

old_ctx = bdrv_co_enter(bs);

bdrv_drained_begin(bs);

blk_truncate(blk, size, false, PREALLOC_MODE_OFF, 0, errp);
  
bdrv_drained_end(bs);

blk_unref(blk);

bdrv_co_leave(bs, old_ctx);


? This way we have one acquire/release section instead of three in a row.. But 
then we probably need additional bdrv_ref/bdrv_unref, to not crash with final 
bdrv_co_leave after blk_unref.

Also, preexisting, but it seems not good that coroutine_fn qmp_block_resize is 
called from non-coroutine hmp_block_resize()

anyway:
Reviewed-by: Vladimir Sementsov-Ogievskiy 

--
Best regards,
Vladimir

Re: [PATCH] file-posix: detect the lock using the real file

2020-12-08 Thread Kevin Wolf

Am 08.12.2020 um 13:59 hat Li Feng geschrieben:
> This patch addresses this issue:
> When accessing a volume on an NFS filesystem without supporting the file lock,
> tools, like qemu-img, will complain "Failed to lock byte 100".
> 
> In the original code, the qemu_has_ofd_lock will test the lock on the
> "/dev/null" pseudo-file. Actually, the file.locking is per-drive property,
> which depends on the underlying filesystem.
> 
> In this patch, make the 'qemu_has_ofd_lock' with a filename be more generic
> and reasonable.
> 
> Signed-off-by: Li Feng 

Do you know any way how I could configure either the NFS server or the
NFS client such that locking would fail? For any patch related to this,
it would be good if I could even test the scenario.

For this specific patch, I think Daniel has already provided a good
explanation of the fundamental problems it has.

Kevin

[PATCH v3 2/2] block: qcow2: remove the created file on initialization error

2020-12-08 Thread Maxim Levitsky

If the qcow initialization fails, we should remove the file if it was
already created, to avoid leaving stale files around.

We already do this for luks raw images.

Signed-off-by: Maxim Levitsky 
---
 block/qcow2.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index 3a90ef2786..3bc2096b72 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3848,6 +3848,19 @@ static int coroutine_fn qcow2_co_create_opts(BlockDriver 
*drv,
 /* Create the qcow2 image (format layer) */
 ret = qcow2_co_create(create_options, errp);
 if (ret < 0) {
+
+Error *local_delete_err = NULL;
+int r_del = bdrv_co_delete_file(bs, &local_delete_err);
+/*
+ * ENOTSUP will happen if the block driver doesn't support
+ * the 'bdrv_co_delete_file' interface. This is a predictable
+ * scenario and shouldn't be reported back to the user.
+ */
+if ((r_del < 0) && (r_del != -ENOTSUP)) {
+error_report_err(local_delete_err);
+} else {
+error_free(local_delete_err);
+}
 goto finish;
 }
 
-- 
2.26.2

[PATCH v3 1/2] crypto: luks: Fix tiny memory leak

2020-12-08 Thread Maxim Levitsky

When the underlying block device doesn't support the
bdrv_co_delete_file interface, an 'Error' object was leaked.

Signed-off-by: Maxim Levitsky 
---
 block/crypto.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/crypto.c b/block/crypto.c
index aef5a5721a..b3a5275132 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -735,6 +735,8 @@ fail:
  */
 if ((r_del < 0) && (r_del != -ENOTSUP)) {
 error_report_err(local_delete_err);
+} else {
+error_free(local_delete_err);
 }
 }
 
-- 
2.26.2

[PATCH v3 0/2] qcow2: don't leave partially initialized file on image creation

2020-12-08 Thread Maxim Levitsky

Use the bdrv_co_delete_file interface to delete the underlying
file if qcow2 initialization fails (e.g due to bad encryption secret)

This makes the qcow2 driver behave the same way as the luks driver behaves.

Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1845353

V3: addressed review feedback and reworked commit messages

Best regards,
Maxim Levitsky

Maxim Levitsky (2):
  crypto: luks: Fix tiny memory leak
  block: qcow2: remove the created file on initialization error

 block/crypto.c |  2 ++
 block/qcow2.c  | 13 +
 2 files changed, 15 insertions(+)

-- 
2.26.2

Re: [PATCH 1/3] block: Simplify qmp_block_resize() error paths

2020-12-08 Thread Vladimir Sementsov-Ogievskiy


03.12.2020 20:23, Kevin Wolf wrote:

The only thing that happens after the 'out:' label is blk_unref(blk).
However, blk = NULL in all of the error cases, so instead of jumping to
'out:', we can just return directly.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Kevin Wolf 
---
  blockdev.c | 7 +++
  1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index fe6fb5dc1d..229d2cce1b 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2468,17 +2468,17 @@ void coroutine_fn qmp_block_resize(bool has_device, 
const char *device,
  
  if (size < 0) {

  error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "size", "a >0 size");
-goto out;
+return;
  }
  
  if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_RESIZE, NULL)) {

  error_setg(errp, QERR_DEVICE_IN_USE, device);
-goto out;
+return;
  }
  
  blk = blk_new_with_bs(bs, BLK_PERM_RESIZE, BLK_PERM_ALL, errp);

  if (!blk) {
-goto out;
+return;
  }
  
  bdrv_drained_begin(bs);

@@ -2487,7 +2487,6 @@ void coroutine_fn qmp_block_resize(bool has_device, const 
char *device,
  bdrv_co_leave(bs, old_ctx);
  bdrv_drained_end(bs);
  
-out:

  bdrv_co_lock(bs);
  blk_unref(blk);
  bdrv_co_unlock(bs);



Initialization of blk to NULL becomes redundant with this patch, so may be 
dropped too. Anyway:

Reviewed-by: Vladimir Sementsov-Ogievskiy 

--
Best regards,
Vladimir

[PATCH v3] qemu-nbd: Fix a memleak in nbd_client_thread()

2020-12-08 Thread Alex Chen

When the qio_channel_socket_connect_sync() fails
we should goto 'out_socket' label to free the 'sioc' instead of
goto 'out' label.
In addition, there's a lot of redundant code in the successful branch
and the error branch, optimize it.

Reported-by: Euler Robot 
Signed-off-by: Alex Chen 
Signed-off-by: Eric Blake 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 qemu-nbd.c | 40 +---
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index a7075c5419..ee2fbc4cdb 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -265,8 +265,8 @@ static void *nbd_client_thread(void *arg)
 char *device = arg;
 NBDExportInfo info = { .request_sizes = false, .name = g_strdup("") };
 QIOChannelSocket *sioc;
-int fd;
-int ret;
+int fd = -1;
+int ret = EXIT_FAILURE;
 pthread_t show_parts_thread;
 Error *local_error = NULL;
 
@@ -278,26 +278,24 @@ static void *nbd_client_thread(void *arg)
 goto out;
 }
 
-ret = nbd_receive_negotiate(NULL, QIO_CHANNEL(sioc),
-NULL, NULL, NULL, &info, &local_error);
-if (ret < 0) {
+if (nbd_receive_negotiate(NULL, QIO_CHANNEL(sioc),
+  NULL, NULL, NULL, &info, &local_error) < 0) {
 if (local_error) {
 error_report_err(local_error);
 }
-goto out_socket;
+goto out;
 }
 
 fd = open(device, O_RDWR);
 if (fd < 0) {
 /* Linux-only, we can use %m in printf.  */
 error_report("Failed to open %s: %m", device);
-goto out_socket;
+goto out;
 }
 
-ret = nbd_init(fd, sioc, &info, &local_error);
-if (ret < 0) {
+if (nbd_init(fd, sioc, &info, &local_error) < 0) {
 error_report_err(local_error);
-goto out_fd;
+goto out;
 }
 
 /* update partition table */
@@ -311,24 +309,20 @@ static void *nbd_client_thread(void *arg)
 dup2(STDOUT_FILENO, STDERR_FILENO);
 }
 
-ret = nbd_client(fd);
-if (ret) {
-goto out_fd;
+if (nbd_client(fd) < 0) {
+goto out;
 }
-close(fd);
-object_unref(OBJECT(sioc));
-g_free(info.name);
-kill(getpid(), SIGTERM);
-return (void *) EXIT_SUCCESS;
 
-out_fd:
-close(fd);
-out_socket:
+ret = EXIT_SUCCESS;
+
+ out:
+if (fd >= 0) {
+close(fd);
+}
 object_unref(OBJECT(sioc));
-out:
 g_free(info.name);
 kill(getpid(), SIGTERM);
-return (void *) EXIT_FAILURE;
+return (void *) (intptr_t) ret;
 }
 #endif /* HAVE_NBD_DEVICE */
 
-- 
2.19.1

Re: [PATCH v2] qemu-nbd: Fix a memleak in nbd_client_thread()

2020-12-08 Thread Alex Chen

On 2020/12/8 21:41, Vladimir Sementsov-Ogievskiy wrote:
> 03.12.2020 16:58, Alex Chen wrote:
>> When the qio_channel_socket_connect_sync() fails
>> we should goto 'out_socket' label to free the 'sioc' instead of
>> goto 'out' label.
>> In addition, there's a lot of redundant code in the successful branch
>> and the error branch, optimize it.
>>
>> Reported-by: Euler Robot 
>> Signed-off-by: Alex Chen 
>> Signed-off-by: Eric Blake 
>> ---
>>   qemu-nbd.c | 38 +++---
>>   1 file changed, 15 insertions(+), 23 deletions(-)
>>
>> diff --git a/qemu-nbd.c b/qemu-nbd.c
>> index a7075c5419..9583ee1af6 100644
>> --- a/qemu-nbd.c
>> +++ b/qemu-nbd.c
>> @@ -265,8 +265,8 @@ static void *nbd_client_thread(void *arg)
>>   char *device = arg;
>>   NBDExportInfo info = { .request_sizes = false, .name = g_strdup("") };
>>   QIOChannelSocket *sioc;
>> -int fd;
>> -int ret;
>> +int fd = -1;
>> +int ret = EXIT_FAILURE;
>>   pthread_t show_parts_thread;
>>   Error *local_error = NULL;
>>   @@ -278,26 +278,24 @@ static void *nbd_client_thread(void *arg)
>>   goto out;
>>   }
>>   -ret = nbd_receive_negotiate(NULL, QIO_CHANNEL(sioc),
>> -NULL, NULL, NULL, &info, &local_error);
>> -if (ret < 0) {
>> +if (nbd_receive_negotiate(NULL, QIO_CHANNEL(sioc),
>> +  NULL, NULL, NULL, &info, &local_error) < 0) {
>>   if (local_error) {
>>   error_report_err(local_error);
>>   }
>> -goto out_socket;
>> +goto out;
>>   }
>> fd = open(device, O_RDWR);
>>   if (fd < 0) {
>>   /* Linux-only, we can use %m in printf.  */
>>   error_report("Failed to open %s: %m", device);
>> -goto out_socket;
>> +goto out;
>>   }
>>   -ret = nbd_init(fd, sioc, &info, &local_error);
>> -if (ret < 0) {
>> +if (nbd_init(fd, sioc, &info, &local_error) < 0) {
>>   error_report_err(local_error);
>> -goto out_fd;
>> +goto out;
>>   }
>> /* update partition table */
>> @@ -311,24 +309,18 @@ static void *nbd_client_thread(void *arg)
>>   dup2(STDOUT_FILENO, STDERR_FILENO);
>>   }
>>   -ret = nbd_client(fd);
>> -if (ret) {
>> -goto out_fd;
>> +if (nbd_client(fd) == 0) {
>> +ret = EXIT_SUCCESS;
> 
> It's not obvious that nbd_client() returns 0 on success, it calls ioctl(), 
> which may return something positive in theory..
> 
> So, with s/==/>=/, or with just
> 
> if (nbd_client(fd) < 0) {
>   goto out;
> }
> 
> ret = EXIT_SUCCESS;
> 
> 
> (which is good common pattern I think)
> 
> :
> 

Thanks for your review, I will fix it and send patch v3.

Thanks,
Alex

Re: [PATCH] file-posix: detect the lock using the real file

2020-12-08 Thread Daniel P . Berrangé

On Tue, Dec 08, 2020 at 08:59:37PM +0800, Li Feng wrote:
> This patch addresses this issue:
> When accessing a volume on an NFS filesystem without supporting the file lock,
> tools, like qemu-img, will complain "Failed to lock byte 100".
> 
> In the original code, the qemu_has_ofd_lock will test the lock on the
> "/dev/null" pseudo-file. Actually, the file.locking is per-drive property,
> which depends on the underlying filesystem.

IIUC, the problem you're describing is one of whether the filesystem
supports fcntl locking at all, which is indeed a per-FS check.

The QEMU code being changed though is just about detecting whether
the host OS supports OFD or not, which is supposed to be a kernel
level feature applied universally to all FS types.

> 
> In this patch, make the 'qemu_has_ofd_lock' with a filename be more generic
> and reasonable.
> 
> Signed-off-by: Li Feng 
> ---
>  block/file-posix.c | 32 +++-
>  include/qemu/osdep.h   |  2 +-
>  tests/test-image-locking.c |  2 +-
>  util/osdep.c   | 43 --
>  4 files changed, 47 insertions(+), 32 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 806764f7e3..03be1b188c 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -595,7 +595,7 @@ static int raw_open_common(BlockDriverState *bs, QDict 
> *options,
>  switch (locking) {
>  case ON_OFF_AUTO_ON:
>  s->use_lock = true;
> -if (!qemu_has_ofd_lock()) {
> +if (!qemu_has_ofd_lock(filename)) {
>  warn_report("File lock requested but OFD locking syscall is "
>  "unavailable, falling back to POSIX file locks");
>  error_printf("Due to the implementation, locks can be lost "
> @@ -606,7 +606,7 @@ static int raw_open_common(BlockDriverState *bs, QDict 
> *options,
>  s->use_lock = false;
>  break;
>  case ON_OFF_AUTO_AUTO:
> -s->use_lock = qemu_has_ofd_lock();
> +s->use_lock = qemu_has_ofd_lock(filename);
>  break;
>  default:
>  abort();
> @@ -2388,6 +2388,7 @@ raw_co_create(BlockdevCreateOptions *options, Error 
> **errp)
>  int fd;
>  uint64_t perm, shared;
>  int result = 0;
> +bool use_lock;
>  
>  /* Validate options and set default values */
>  assert(options->driver == BLOCKDEV_DRIVER_FILE);
> @@ -2428,19 +2429,22 @@ raw_co_create(BlockdevCreateOptions *options, Error 
> **errp)
>  perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
>  shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
>  
> -/* Step one: Take locks */
> -result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
> -if (result < 0) {
> -goto out_close;
> -}
> +use_lock = qemu_has_ofd_lock(file_opts->filename);
> +if (use_lock) {
> +/* Step one: Take locks */
> +result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
> +if (result < 0) {
> +goto out_close;
> +}
>  
> -/* Step two: Check that nobody else has taken conflicting locks */
> -result = raw_check_lock_bytes(fd, perm, shared, errp);
> -if (result < 0) {
> -error_append_hint(errp,
> -  "Is another process using the image [%s]?\n",
> -  file_opts->filename);
> -goto out_unlock;
> +/* Step two: Check that nobody else has taken conflicting locks */
> +result = raw_check_lock_bytes(fd, perm, shared, errp);
> +if (result < 0) {
> +error_append_hint(errp,
> +  "Is another process using the image [%s]?\n",
> +  file_opts->filename);
> +goto out_unlock;
> +}
>  }
>  
>  /* Clear the file by truncating it to 0 */
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index f9ec8c84e9..349adad465 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -512,7 +512,7 @@ int qemu_dup(int fd);
>  int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive);
>  int qemu_unlock_fd(int fd, int64_t start, int64_t len);
>  int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive);
> -bool qemu_has_ofd_lock(void);
> +bool qemu_has_ofd_lock(const char *filename);
>  #endif
>  
>  #if defined(__HAIKU__) && defined(__i386__)
> diff --git a/tests/test-image-locking.c b/tests/test-image-locking.c
> index ba057bd66c..3e80246081 100644
> --- a/tests/test-image-locking.c
> +++ b/tests/test-image-locking.c
> @@ -149,7 +149,7 @@ int main(int argc, char **argv)
>  
>  g_test_init(&argc, &argv, NULL);
>  
> -if (qemu_has_ofd_lock()) {
> +if (qemu_has_ofd_lock(NULL)) {
>  g_test_add_func("/image-locking/basic", test_image_locking_basic);
>  g_test_add_func("/image-locking/set-perm-abort", 
> test_set_perm_abort);
>  }
> diff --git a/util/osdep.c b/util/osdep.c
> index 66d01b9160..e7e502edd1

Re: [PATCH v2] qemu-nbd: Fix a memleak in nbd_client_thread()

2020-12-08 Thread Vladimir Sementsov-Ogievskiy


03.12.2020 16:58, Alex Chen wrote:

When the qio_channel_socket_connect_sync() fails
we should goto 'out_socket' label to free the 'sioc' instead of
goto 'out' label.
In addition, there's a lot of redundant code in the successful branch
and the error branch, optimize it.

Reported-by: Euler Robot 
Signed-off-by: Alex Chen 
Signed-off-by: Eric Blake 
---
  qemu-nbd.c | 38 +++---
  1 file changed, 15 insertions(+), 23 deletions(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index a7075c5419..9583ee1af6 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -265,8 +265,8 @@ static void *nbd_client_thread(void *arg)
  char *device = arg;
  NBDExportInfo info = { .request_sizes = false, .name = g_strdup("") };
  QIOChannelSocket *sioc;
-int fd;
-int ret;
+int fd = -1;
+int ret = EXIT_FAILURE;
  pthread_t show_parts_thread;
  Error *local_error = NULL;
  
@@ -278,26 +278,24 @@ static void *nbd_client_thread(void *arg)

  goto out;
  }
  
-ret = nbd_receive_negotiate(NULL, QIO_CHANNEL(sioc),

-NULL, NULL, NULL, &info, &local_error);
-if (ret < 0) {
+if (nbd_receive_negotiate(NULL, QIO_CHANNEL(sioc),
+  NULL, NULL, NULL, &info, &local_error) < 0) {
  if (local_error) {
  error_report_err(local_error);
  }
-goto out_socket;
+goto out;
  }
  
  fd = open(device, O_RDWR);

  if (fd < 0) {
  /* Linux-only, we can use %m in printf.  */
  error_report("Failed to open %s: %m", device);
-goto out_socket;
+goto out;
  }
  
-ret = nbd_init(fd, sioc, &info, &local_error);

-if (ret < 0) {
+if (nbd_init(fd, sioc, &info, &local_error) < 0) {
  error_report_err(local_error);
-goto out_fd;
+goto out;
  }
  
  /* update partition table */

@@ -311,24 +309,18 @@ static void *nbd_client_thread(void *arg)
  dup2(STDOUT_FILENO, STDERR_FILENO);
  }
  
-ret = nbd_client(fd);

-if (ret) {
-goto out_fd;
+if (nbd_client(fd) == 0) {
+ret = EXIT_SUCCESS;


It's not obvious that nbd_client() returns 0 on success: it calls ioctl(),
which may, in theory, return something positive.

So, with s/==/>=/, or with just

if (nbd_client(fd) < 0) {
  goto out;
}

ret = EXIT_SUCCESS;


(which is a good common pattern, I think)

:

Reviewed-by: Vladimir Sementsov-Ogievskiy 

--
Best regards,
Vladimir

Re: [PATCH v1 0/2] Add timeout mechanism to qmp actions

2020-12-08 Thread Stefan Hajnoczi

On Thu, Oct 22, 2020 at 05:29:16PM +0100, Fam Zheng wrote:
> On Tue, 2020-10-20 at 09:34 +0800, Zhenyu Ye wrote:
> > On 2020/10/19 21:25, Paolo Bonzini wrote:
> > > On 19/10/20 14:40, Zhenyu Ye wrote:
> > > > The kernel backtrace for io_submit in GUEST is:
> > > > 
> > > > guest# ./offcputime -K -p `pgrep -nx fio`
> > > > b'finish_task_switch'
> > > > b'__schedule'
> > > > b'schedule'
> > > > b'io_schedule'
> > > > b'blk_mq_get_tag'
> > > > b'blk_mq_get_request'
> > > > b'blk_mq_make_request'
> > > > b'generic_make_request'
> > > > b'submit_bio'
> > > > b'blkdev_direct_IO'
> > > > b'generic_file_read_iter'
> > > > b'aio_read'
> > > > b'io_submit_one'
> > > > b'__x64_sys_io_submit'
> > > > b'do_syscall_64'
> > > > b'entry_SYSCALL_64_after_hwframe'
> > > > -fio (1464)
> > > > 40031912
> > > > 
> > > > And Linux io_uring can avoid the latency problem.
> 
> Thanks for the info. What this tells us is basically the inflight
> requests are high. It's sad that the linux-aio is in practice
> implemented as a blocking API.
> 
> Host side backtrace will be of more help. Can you get that too?

I guess Linux AIO didn't set the BLK_MQ_REQ_NOWAIT flag so the task went
to sleep when it ran out of blk-mq tags. The easiest solution is to move
to io_uring. Linux AIO is broken - it's not AIO :).

If we know that no other process is writing to the host block device
then maybe we can determine the blk-mq tags limit (the queue depth) and
avoid sending more requests. That way QEMU doesn't block, but I don't
think this approach works when other processes are submitting I/O to the
same host block device :(.

Fam's original suggestion of invoking io_submit(2) from a worker thread
is an option, but I'm afraid it will slow down the uncontended case.

I'm CCing Glauber in case he battled this in the past in ScyllaDB.

Stefan

signature.asc
Description: PGP signature

[PATCH] file-posix: detect the lock using the real file

2020-12-08 Thread Li Feng

This patch addresses the following issue:
When accessing a volume on an NFS filesystem that does not support file
locking, tools such as qemu-img complain "Failed to lock byte 100".

In the original code, qemu_has_ofd_lock() tests the lock on the
"/dev/null" pseudo-file. However, file.locking is a per-drive property
that depends on the underlying filesystem.

This patch makes 'qemu_has_ofd_lock' take a filename, so that the lock is
detected using the real file, which is more generic and reasonable.

Signed-off-by: Li Feng 
---
 block/file-posix.c | 32 +++-
 include/qemu/osdep.h   |  2 +-
 tests/test-image-locking.c |  2 +-
 util/osdep.c   | 43 --
 4 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 806764f7e3..03be1b188c 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -595,7 +595,7 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
 switch (locking) {
 case ON_OFF_AUTO_ON:
 s->use_lock = true;
-if (!qemu_has_ofd_lock()) {
+if (!qemu_has_ofd_lock(filename)) {
 warn_report("File lock requested but OFD locking syscall is "
 "unavailable, falling back to POSIX file locks");
 error_printf("Due to the implementation, locks can be lost "
@@ -606,7 +606,7 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
 s->use_lock = false;
 break;
 case ON_OFF_AUTO_AUTO:
-s->use_lock = qemu_has_ofd_lock();
+s->use_lock = qemu_has_ofd_lock(filename);
 break;
 default:
 abort();
@@ -2388,6 +2388,7 @@ raw_co_create(BlockdevCreateOptions *options, Error 
**errp)
 int fd;
 uint64_t perm, shared;
 int result = 0;
+bool use_lock;
 
 /* Validate options and set default values */
 assert(options->driver == BLOCKDEV_DRIVER_FILE);
@@ -2428,19 +2429,22 @@ raw_co_create(BlockdevCreateOptions *options, Error 
**errp)
 perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
 shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
 
-/* Step one: Take locks */
-result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
-if (result < 0) {
-goto out_close;
-}
+use_lock = qemu_has_ofd_lock(file_opts->filename);
+if (use_lock) {
+/* Step one: Take locks */
+result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
+if (result < 0) {
+goto out_close;
+}
 
-/* Step two: Check that nobody else has taken conflicting locks */
-result = raw_check_lock_bytes(fd, perm, shared, errp);
-if (result < 0) {
-error_append_hint(errp,
-  "Is another process using the image [%s]?\n",
-  file_opts->filename);
-goto out_unlock;
+/* Step two: Check that nobody else has taken conflicting locks */
+result = raw_check_lock_bytes(fd, perm, shared, errp);
+if (result < 0) {
+error_append_hint(errp,
+  "Is another process using the image [%s]?\n",
+  file_opts->filename);
+goto out_unlock;
+}
 }
 
 /* Clear the file by truncating it to 0 */
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index f9ec8c84e9..349adad465 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -512,7 +512,7 @@ int qemu_dup(int fd);
 int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive);
 int qemu_unlock_fd(int fd, int64_t start, int64_t len);
 int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive);
-bool qemu_has_ofd_lock(void);
+bool qemu_has_ofd_lock(const char *filename);
 #endif
 
 #if defined(__HAIKU__) && defined(__i386__)
diff --git a/tests/test-image-locking.c b/tests/test-image-locking.c
index ba057bd66c..3e80246081 100644
--- a/tests/test-image-locking.c
+++ b/tests/test-image-locking.c
@@ -149,7 +149,7 @@ int main(int argc, char **argv)
 
 g_test_init(&argc, &argv, NULL);
 
-if (qemu_has_ofd_lock()) {
+if (qemu_has_ofd_lock(NULL)) {
 g_test_add_func("/image-locking/basic", test_image_locking_basic);
 g_test_add_func("/image-locking/set-perm-abort", test_set_perm_abort);
 }
diff --git a/util/osdep.c b/util/osdep.c
index 66d01b9160..e7e502edd1 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -42,6 +42,7 @@ extern int madvise(char *, size_t, int);
 static bool fips_enabled = false;
 
 static const char *hw_version = QEMU_HW_VERSION;
+static const char *null_device = "/dev/null";
 
 int socket_set_cork(int fd, int v)
 {
@@ -187,11 +188,10 @@ static int qemu_parse_fdset(const char *param)
 return qemu_parse_fd(param);
 }
 
-static void qemu_probe_lock_ops(void)
+static void qemu_probe_lock_ops_fd(int fd)
 {
 if (fcntl_op_setlk == -1) {
 #ifdef F_OFD_SETLK
-int fd;
 int ret;
 struct flock fl = {
 .l_whence =

[PATCH] hw/block/nvme: fix bad clearing of CAP

2020-12-08 Thread Klaus Jensen

From: Klaus Jensen 

Commit 37712e00b1f0 ("hw/block/nvme: factor out pmr setup") changed the
control flow such that the CAP register is erroneously cleared after
nvme_init_pmr() has configured it. Since the entire NvmeCtrl structure
is zero-filled initially, there is no need for the explicit clearing, so
just remove it.

Fixes: 37712e00b1f0 ("hw/block/nvme: factor out pmr setup")
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 8814201364c1..28416b18a5c0 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3040,7 +3040,6 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 id->psd[0].enlat = cpu_to_le32(0x10);
 id->psd[0].exlat = cpu_to_le32(0x4);
 
-n->bar.cap = 0;
 NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
 NVME_CAP_SET_CQR(n->bar.cap, 1);
 NVME_CAP_SET_TO(n->bar.cap, 0xf);
-- 
2.29.2

[PATCH v3 2/2] hw/block/nvme: add simple copy command

2020-12-08 Thread Klaus Jensen

From: Klaus Jensen 

Add support for TP 4065a ("Simple Copy Command"), v2020.05.04
("Ratified").

The implementation uses a bounce buffer to first read in the source
logical blocks, then issue a write of that bounce buffer. The default
maximum number of source logical blocks is 128, translating to 512 KiB
for 4k logical blocks which aligns with the default value of MDTS.

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.h|   4 +
 hw/block/nvme.h   |   1 +
 hw/block/nvme-ns.c|   8 ++
 hw/block/nvme.c   | 224 +-
 hw/block/trace-events |   6 ++
 5 files changed, 242 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 44bf6271b744..745d288b09cf 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -21,6 +21,10 @@
 
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
+
+uint16_t mssrl;
+uint32_t mcl;
+uint8_t  msrc;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 574333caa3f9..f549abeeb930 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -62,6 +62,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
 case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
 case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
 case NVME_CMD_DSM:  return "NVME_NVM_CMD_DSM";
+case NVME_CMD_COPY: return "NVME_NVM_CMD_COPY";
 default:return "NVME_NVM_CMD_UNKNOWN";
 }
 }
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 2d69b5177b51..f53f8fc56fd8 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -59,6 +59,11 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 
 id_ns->npda = id_ns->npdg = npdg - 1;
 
+/* simple copy */
+id_ns->mssrl = cpu_to_le16(ns->params.mssrl);
+id_ns->mcl = cpu_to_le32(ns->params.mcl);
+id_ns->msrc = ns->params.msrc;
+
 return 0;
 }
 
@@ -150,6 +155,9 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
 static Property nvme_ns_props[] = {
 DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
 DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
+DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128),
+DEFINE_PROP_UINT32("mcl", NvmeNamespace, params.mcl, 128),
+DEFINE_PROP_UINT8("msrc", NvmeNamespace, params.msrc, 127),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 8814201364c1..d06ffab7e684 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -999,6 +999,109 @@ static void nvme_aio_discard_cb(void *opaque, int ret)
 nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+struct nvme_copy_ctx {
+int copies;
+uint8_t *bounce;
+uint32_t nlb;
+};
+
+struct nvme_copy_in_ctx {
+NvmeRequest *req;
+QEMUIOVector iov;
+};
+
+static void nvme_copy_cb(void *opaque, int ret)
+{
+NvmeRequest *req = opaque;
+NvmeNamespace *ns = req->ns;
+struct nvme_copy_ctx *ctx = req->opaque;
+
+trace_pci_nvme_copy_cb(nvme_cid(req));
+
+if (!ret) {
+block_acct_done(blk_get_stats(ns->blkconf.blk), >acct);
+} else {
+block_acct_failed(blk_get_stats(ns->blkconf.blk), >acct);
+nvme_aio_err(req, ret);
+}
+
+g_free(ctx->bounce);
+g_free(ctx);
+
+nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+static void nvme_copy_in_complete(NvmeRequest *req)
+{
+NvmeNamespace *ns = req->ns;
+NvmeCopyCmd *copy = (NvmeCopyCmd *)>cmd;
+struct nvme_copy_ctx *ctx = req->opaque;
+uint64_t sdlba = le64_to_cpu(copy->sdlba);
+uint16_t status;
+
+trace_pci_nvme_copy_in_complete(nvme_cid(req));
+
+block_acct_done(blk_get_stats(ns->blkconf.blk), >acct);
+
+status = nvme_check_bounds(ns, sdlba, ctx->nlb);
+if (status) {
+trace_pci_nvme_err_invalid_lba_range(sdlba, ctx->nlb, ns->id_ns.nsze);
+req->status = status;
+
+g_free(ctx->bounce);
+g_free(ctx);
+
+nvme_enqueue_req_completion(nvme_cq(req), req);
+
+return;
+}
+
+qemu_iovec_init(>iov, 1);
+qemu_iovec_add(>iov, ctx->bounce, nvme_l2b(ns, ctx->nlb));
+
+block_acct_start(blk_get_stats(ns->blkconf.blk), >acct,
+ nvme_l2b(ns, ctx->nlb), BLOCK_ACCT_WRITE);
+
+req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba),
+ >iov, 0, nvme_copy_cb, req);
+}
+
+static void nvme_aio_copy_in_cb(void *opaque, int ret)
+{
+struct nvme_copy_in_ctx *in_ctx = opaque;
+NvmeRequest *req = in_ctx->req;
+NvmeNamespace *ns = req->ns;
+struct nvme_copy_ctx *ctx = req->opaque;
+
+qemu_iovec_destroy(_ctx->iov);
+g_free(in_ctx);
+
+trace_pci_nvme_aio_copy_in_cb(nvme_cid(req));
+
+if (ret) {
+nvme_aio_err(req, ret);
+}
+
+ctx->copies--;
+
+if (ctx->copies) {
+return;
+}
+
+if (req->status) {
+

[PATCH v3 1/2] nvme: updated shared header for copy command

2020-12-08 Thread Klaus Jensen

From: Klaus Jensen 

Add new data structures and types for the Simple Copy command.

Signed-off-by: Klaus Jensen 
Cc: Stefan Hajnoczi 
Cc: Fam Zheng 
Reviewed-by: Minwoo Im 
Acked-by: Stefan Hajnoczi 
---
 include/block/nvme.h | 45 ++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index e95ff6ca9b37..be3aca913a1d 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -472,6 +472,7 @@ enum NvmeIoCommands {
 NVME_CMD_COMPARE= 0x05,
 NVME_CMD_WRITE_ZEROES   = 0x08,
 NVME_CMD_DSM= 0x09,
+NVME_CMD_COPY   = 0x19,
 };
 
 typedef struct QEMU_PACKED NvmeDeleteQ {
@@ -603,6 +604,35 @@ typedef struct QEMU_PACKED NvmeDsmRange {
 uint64_tslba;
 } NvmeDsmRange;
 
+enum {
+NVME_COPY_FORMAT_0 = 0x0,
+};
+
+typedef struct NvmeCopyCmd {
+uint8_t opcode;
+uint8_t flags;
+uint16_tcid;
+uint32_tnsid;
+uint32_trsvd2[4];
+NvmeCmdDptr dptr;
+uint64_tsdlba;
+uint32_tcdw12;
+uint32_tcdw13;
+uint32_tilbrt;
+uint16_tlbat;
+uint16_tlbatm;
+} NvmeCopyCmd;
+
+typedef struct NvmeCopySourceRange {
+uint8_t  rsvd0[8];
+uint64_t slba;
+uint16_t nlb;
+uint8_t  rsvd18[6];
+uint32_t eilbrt;
+uint16_t elbat;
+uint16_t elbatm;
+} NvmeCopySourceRange;
+
 enum NvmeAsyncEventRequest {
 NVME_AER_TYPE_ERROR = 0,
 NVME_AER_TYPE_SMART = 1,
@@ -680,6 +710,7 @@ enum NvmeStatusCodes {
 NVME_CONFLICTING_ATTRS  = 0x0180,
 NVME_INVALID_PROT_INFO  = 0x0181,
 NVME_WRITE_TO_RO= 0x0182,
+NVME_CMD_SIZE_LIMIT = 0x0183,
 NVME_WRITE_FAULT= 0x0280,
 NVME_UNRECOVERED_READ   = 0x0281,
 NVME_E2E_GUARD_ERROR= 0x0282,
@@ -831,7 +862,7 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
 uint8_t nvscc;
 uint8_t rsvd531;
 uint16_tacwu;
-uint8_t rsvd534[2];
+uint16_tocfs;
 uint32_tsgls;
 uint8_t rsvd540[228];
 uint8_t subnqn[256];
@@ -854,6 +885,11 @@ enum NvmeIdCtrlOncs {
 NVME_ONCS_FEATURES  = 1 << 4,
 NVME_ONCS_RESRVATIONS   = 1 << 5,
 NVME_ONCS_TIMESTAMP = 1 << 6,
+NVME_ONCS_COPY  = 1 << 8,
+};
+
+enum NvmeIdCtrlOcfs {
+NVME_OCFS_COPY_FORMAT_0 = 1 << 0,
 };
 
 enum NvmeIdCtrlFrmw {
@@ -995,7 +1031,10 @@ typedef struct QEMU_PACKED NvmeIdNs {
 uint16_tnpdg;
 uint16_tnpda;
 uint16_tnows;
-uint8_t rsvd74[30];
+uint16_tmssrl;
+uint32_tmcl;
+uint8_t msrc;
+uint8_t rsvd81[23];
 uint8_t nguid[16];
 uint64_teui64;
 NvmeLBAFlbaf[16];
@@ -1059,6 +1098,7 @@ static inline void _nvme_check_size(void)
 QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4);
 QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16);
 QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16);
+QEMU_BUILD_BUG_ON(sizeof(NvmeCopySourceRange) != 32);
 QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64);
@@ -1066,6 +1106,7 @@ static inline void _nvme_check_size(void)
 QEMU_BUILD_BUG_ON(sizeof(NvmeIdentify) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeRwCmd) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64);
+QEMU_BUILD_BUG_ON(sizeof(NvmeCopyCmd) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
-- 
2.29.2

[PATCH v3 0/2] hw/block/nvme: add simple copy command

2020-12-08 Thread Klaus Jensen

From: Klaus Jensen 

Add support for TP 4065 ("Simple Copy Command").

Changes for v3

  * rebased on nvme-next
  * changed the default msrc value to a more reasonable 127 from 255 to
better align with the default mcl value of 128.

Changes for v2

  * prefer style that aligns with existing NvmeIdCtrl field enums
(Minwoo)
  * swapped elbat/elbatm fields in copy source range. I've kept the R-b
and A-b from Minwoo and Stefan since this is a non-functional change
(the device does not use these fields at all).

Klaus Jensen (2):
  nvme: updated shared header for copy command
  hw/block/nvme: add simple copy command

 hw/block/nvme-ns.h|   4 +
 hw/block/nvme.h   |   1 +
 include/block/nvme.h  |  45 -
 hw/block/nvme-ns.c|   8 ++
 hw/block/nvme.c   | 224 +-
 hw/block/trace-events |   6 ++
 6 files changed, 285 insertions(+), 3 deletions(-)

-- 
2.29.2

Re: [PATCH v2] hw/block/nvme: add compare command

2020-12-08 Thread Klaus Jensen

On Nov 27 07:21, Minwoo Im wrote:
> Hello,
> 
> On Fri, Nov 27, 2020 at 3:56 AM Klaus Jensen  wrote:
> >
> > From: Gollu Appalanaidu 
> >
> > Add the Compare command.
> >
> > This implementation uses a bounce buffer to read in the data from
> > storage and then compare with the host supplied buffer.
> >
> > Signed-off-by: Gollu Appalanaidu 
> > [k.jensen: rebased]
> > Signed-off-by: Klaus Jensen 
> 
> 
> Reviewed-by: Minwoo Im 
> 

Thanks, applied to nvme-next.


signature.asc
Description: PGP signature

57 matches

Mail list logo