[PATCH v5 14/14] hw/block/nvme: Document zoned parameters in usage text

2020-09-27 Thread Dmitry Fomichev
Added brief descriptions of the new device properties that are
now available to users to configure features of Zoned Namespace
Command Set in the emulator.

This patch is for documentation only, no functionality change.

Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme.c | 44 ++--
 1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index ff7d43d38f..34fc6daf9d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -9,7 +9,7 @@
  */
 
 /**
- * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
+ * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
  *
  *  https://nvmexpress.org/developers/nvme-specification/
  */
@@ -23,7 +23,8 @@
  *  max_ioqpairs=, \
  *  aerl=, aer_max_queued=, \
  *  mdts=
- *  -device nvme-ns,drive=,bus=bus_name,nsid=
+ *  -device nvme-ns,drive=,bus=bus_name,nsid=, \
+ *  zoned=
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
@@ -49,6 +50,45 @@
 *   completion when there are no outstanding AERs. When the maximum number of
 *   enqueued events is reached, subsequent events will be dropped.
  *
+ * Setting `zoned` to true selects the Zoned Namespace Command Set at the
+ * namespace level. In this case, the following namespace properties are
+ * available to configure zoned operation:
+ * zone_size=
+ *
+ * zone_capacity=
+ * The value 0 (default) forces zone capacity to be the same as zone
+ * size. The value of this property may not exceed zone size.
+ *
+ * zone_file=
+ * Zone metadata file, if specified, allows zone information
+ * to be persistent across shutdowns and restarts.
+ *
+ * zone_descr_ext_size=
+ * This value needs to be specified in 64B units. If it is zero,
+ * namespace(s) will not support zone descriptor extensions.
+ *
+ * max_active=
+ *
+ * max_open=
+ *
+ * zone_append_size_limit=
+ * The maximum I/O size that can be supported by the Zone Append
+ * command. Since internally this value is maintained as
+ * ZASL = log2( / ), some
+ * values assigned to this property may be rounded down and
+ * result in a lower maximum ZA data size being in effect.
+ * If the MDTS property is not assigned, the default value of 128KiB
+ * is used as ZASL.
+ *
+ * offline_zones=
+ *
+ * rdonly_zones=
+ *
+ * cross_zone_read=
+ *
+ * fill_pattern=
+ * The byte pattern to return for any portions of unwritten data
+ * during read.
  */
 
 #include "qemu/osdep.h"
-- 
2.21.0
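For illustration (this sketch is not part of the patch), the ZASL relationship
described in the usage text above can be written out in a few lines of C; the
function name and the page_size parameter are assumptions standing in for the
controller memory page size:

#include <stdint.h>

/*
 * Illustrative only, not part of the patch: derive a ZASL exponent from a
 * byte-granular zone_append_size_limit. Values that are not an exact power
 * of two round down, which is why the effective Zone Append limit can end
 * up lower than the configured one.
 */
static uint8_t zasl_from_limit(uint64_t zone_append_size_limit,
                               uint64_t page_size)
{
    uint64_t units = zone_append_size_limit / page_size;
    uint8_t zasl = 0;

    while (units > 1) {        /* floor(log2(units)) */
        units >>= 1;
        zasl++;
    }
    return zasl;
}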




[PATCH v5 12/14] hw/block/nvme: Add injection of Offline/Read-Only zones

2020-09-27 Thread Dmitry Fomichev
The ZNS specification defines two zone conditions for zones that can no
longer function properly, possibly because of flash wear or other
internal faults. It is useful to be able to "inject" a small number of
such zones for testing purposes.

This commit defines two optional device properties, "offline_zones"
and "rdonly_zones". Users can assign non-zero values to these variables
to specify the number of zones to be initialized as Offline or
Read-Only. The actual number of injected zones may be smaller than the
requested amount - Read-Only and Offline counts are expected to be much
smaller than the total number of zones on a drive.
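A minimal sketch, not part of the patch, of the bound that keeps the injected
zone counts sane; names are illustrative and mirror the checks added to
nvme_calc_zone_geometry() below:

#include <stdbool.h>
#include <stdint.h>

/*
 * Illustrative only, not part of the patch: injected Offline/Read-Only
 * zones may only come out of zones that are not reserved for max_open.
 */
static bool injected_zone_counts_fit(uint32_t nz, uint32_t max_open,
                                     uint32_t nr_offline, uint32_t nr_rdonly)
{
    if (max_open >= nz) {
        return true;    /* mirrors the code below: no check in this case */
    }
    return nr_offline <= nz - max_open &&
           nr_rdonly <= nz - max_open - nr_offline;
}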

Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme-ns.c | 64 ++
 hw/block/nvme-ns.h |  2 ++
 hw/block/nvme.c|  1 -
 3 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 60156dfeaf..47751f2d54 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -21,6 +21,7 @@
 #include "sysemu/sysemu.h"
 #include "sysemu/block-backend.h"
 #include "qapi/error.h"
+#include "crypto/random.h"
 
 #include "hw/qdev-properties.h"
 #include "hw/qdev-core.h"
@@ -192,6 +193,32 @@ static int nvme_calc_zone_geometry(NvmeNamespace *ns, 
Error **errp)
 return -1;
 }
 
+if (ns->params.zd_extension_size) {
+if (ns->params.zd_extension_size & 0x3f) {
+error_setg(errp,
+"zone descriptor extension size must be a multiple of 64B");
+return -1;
+}
+if ((ns->params.zd_extension_size >> 6) > 0xff) {
+error_setg(errp, "zone descriptor extension size is too large");
+return -1;
+}
+}
+
+if (ns->params.max_open_zones < nz) {
+if (ns->params.nr_offline_zones > nz - ns->params.max_open_zones) {
+error_setg(errp, "offline_zones value %u is too large",
+ns->params.nr_offline_zones);
+return -1;
+}
+if (ns->params.nr_rdonly_zones >
+nz - ns->params.max_open_zones - ns->params.nr_offline_zones) {
+error_setg(errp, "rdonly_zones value %u is too large",
+ns->params.nr_rdonly_zones);
+return -1;
+}
+}
+
 return 0;
 }
 
@@ -200,7 +227,9 @@ static void nvme_init_zone_meta(NvmeNamespace *ns)
 uint64_t start = 0, zone_size = ns->zone_size;
 uint64_t capacity = ns->num_zones * zone_size;
 NvmeZone *zone;
+uint32_t rnd;
 int i;
+uint16_t zs;
 
 ns->zone_array = g_malloc0(ns->zone_array_size);
 ns->exp_open_zones = g_malloc0(sizeof(NvmeZoneList));
@@ -233,6 +262,37 @@ static void nvme_init_zone_meta(NvmeNamespace *ns)
 zone->next = 0;
 start += zone_size;
 }
+
+/* If required, make some zones Offline or Read Only */
+
+for (i = 0; i < ns->params.nr_offline_zones; i++) {
+do {
+qcrypto_random_bytes(&rnd, sizeof(rnd), NULL);
+rnd %= ns->num_zones;
+} while (rnd < ns->params.max_open_zones);
+zone = &ns->zone_array[rnd];
+zs = nvme_get_zone_state(zone);
+if (zs != NVME_ZONE_STATE_OFFLINE) {
+nvme_set_zone_state(zone, NVME_ZONE_STATE_OFFLINE);
+} else {
+i--;
+}
+}
+
+for (i = 0; i < ns->params.nr_rdonly_zones; i++) {
+do {
+qcrypto_random_bytes(&rnd, sizeof(rnd), NULL);
+rnd %= ns->num_zones;
+} while (rnd < ns->params.max_open_zones);
+zone = &ns->zone_array[rnd];
+zs = nvme_get_zone_state(zone);
+if (zs != NVME_ZONE_STATE_OFFLINE &&
+zs != NVME_ZONE_STATE_READ_ONLY) {
+nvme_set_zone_state(zone, NVME_ZONE_STATE_READ_ONLY);
+} else {
+i--;
+}
+}
 }
 
 static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace *ns, int lba_index,
@@ -358,6 +418,10 @@ static Property nvme_ns_props[] = {
 DEFINE_PROP_UINT32("max_open", NvmeNamespace, params.max_open_zones, 0),
 DEFINE_PROP_UINT32("zone_descr_ext_size", NvmeNamespace,
params.zd_extension_size, 0),
+DEFINE_PROP_UINT32("offline_zones", NvmeNamespace,
+   params.nr_offline_zones, 0),
+DEFINE_PROP_UINT32("rdonly_zones", NvmeNamespace,
+   params.nr_rdonly_zones, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index ed14644e09..e9b90f9677 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -48,6 +48,8 @@ typedef struct NvmeNamespaceParams {
 uint32_t max_active_zones;
 uint32_t max_open_zones;
 uint32_t zd_extension_size;
+uint32_t nr_offline_zones;
+uint32_t nr_rdonly_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 27d191c659..80973f3ff6 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -54,7 +54,6 @@
 #include "qemu/osdep.h"
 #include 

[PATCH v5 13/14] hw/block/nvme: Use zone metadata file for persistence

2020-09-27 Thread Dmitry Fomichev
A ZNS drive that is emulated by this module is currently initialized
with all zones Empty upon startup. However, actual ZNS SSDs save the
state and condition of all zones in their internal NVRAM in the event
of power loss. When such a drive is powered up again, it closes or
finishes all zones that were open at the moment of shutdown. Besides
that, the write pointer position as well as the state and condition
of all zones is preserved across power-downs.

This commit adds the capability to keep persistent zone metadata for
the device. A new optional module property, "zone_file", is introduced.
If added to the command line, this property specifies the name of the
file that stores the zone metadata. If "zone_file" is omitted, the
device will be initialized with all zones empty, the same as before.

If zone metadata is configured to be persistent, then zone descriptor
extensions also persist across controller shutdowns.
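As a sketch (not from the patch), this is how the metadata file size is
derived in this series: a fixed header, the zone descriptor array and all
zone descriptor extensions, rounded up to the host page size so the file can
be memory-mapped; parameter names are assumptions:

#include <stddef.h>
#include <stdint.h>

/* Illustrative only, not part of the patch. */
static size_t zone_meta_file_size(size_t header_size, size_t zone_array_size,
                                  uint32_t num_zones, uint32_t zd_ext_size,
                                  size_t host_page_size)
{
    size_t sz = header_size + zone_array_size +
                (size_t)num_zones * zd_ext_size;

    /* round up to the next multiple of host_page_size */
    return (sz + host_page_size - 1) / host_page_size * host_page_size;
}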

Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme-ns.c| 341 --
 hw/block/nvme-ns.h|  33 
 hw/block/nvme.c   |   2 +
 hw/block/trace-events |   1 +
 4 files changed, 362 insertions(+), 15 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 47751f2d54..a94021da81 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -20,12 +20,15 @@
 #include "hw/pci/pci.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/block-backend.h"
+#include "sysemu/hostmem.h"
+#include "qom/object_interfaces.h"
 #include "qapi/error.h"
 #include "crypto/random.h"
 
 #include "hw/qdev-properties.h"
 #include "hw/qdev-core.h"
 
+#include "trace.h"
 #include "nvme.h"
 #include "nvme-ns.h"
 
@@ -98,6 +101,7 @@ void nvme_add_zone_tail(NvmeNamespace *ns, NvmeZoneList *zl, 
NvmeZone *zone)
 zl->tail = idx;
 }
 zl->size++;
+nvme_set_zone_meta_dirty(ns);
 }
 
 /*
@@ -113,12 +117,15 @@ void nvme_remove_zone(NvmeNamespace *ns, NvmeZoneList 
*zl, NvmeZone *zone)
 if (zl->size == 0) {
 zl->head = NVME_ZONE_LIST_NIL;
 zl->tail = NVME_ZONE_LIST_NIL;
+nvme_set_zone_meta_dirty(ns);
 } else if (idx == zl->head) {
 zl->head = zone->next;
 ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL;
+nvme_set_zone_meta_dirty(ns);
 } else if (idx == zl->tail) {
 zl->tail = zone->prev;
 ns->zone_array[zl->tail].next = NVME_ZONE_LIST_NIL;
+nvme_set_zone_meta_dirty(ns);
 } else {
 ns->zone_array[zone->next].prev = zone->prev;
 ns->zone_array[zone->prev].next = zone->next;
@@ -144,6 +151,7 @@ NvmeZone *nvme_remove_zone_head(NvmeNamespace *ns, 
NvmeZoneList *zl)
 ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL;
 }
 zone->prev = zone->next = 0;
+nvme_set_zone_meta_dirty(ns);
 }
 
 return zone;
@@ -219,11 +227,119 @@ static int nvme_calc_zone_geometry(NvmeNamespace *ns, 
Error **errp)
 }
 }
 
+ns->meta_size = sizeof(NvmeZoneMeta) + ns->zone_array_size +
+  nz * ns->params.zd_extension_size;
+ns->meta_size = ROUND_UP(ns->meta_size, qemu_real_host_page_size);
+
+return 0;
+}
+
+static int nvme_validate_zone_file(NvmeNamespace *ns, uint64_t capacity)
+{
+NvmeZoneMeta *meta = ns->zone_meta;
+NvmeZone *zone = ns->zone_array;
+uint64_t start = 0, zone_size = ns->zone_size;
+int i, n_imp_open = 0, n_exp_open = 0, n_closed = 0, n_full = 0;
+
+if (meta->magic != NVME_ZONE_META_MAGIC) {
+return 1;
+}
+if (meta->version != NVME_ZONE_META_VER) {
+return 2;
+}
+if (meta->zone_size != zone_size) {
+return 3;
+}
+if (meta->zone_capacity != ns->zone_capacity) {
+return 4;
+}
+if (meta->nr_offline_zones != ns->params.nr_offline_zones) {
+return 5;
+}
+if (meta->nr_rdonly_zones != ns->params.nr_rdonly_zones) {
+return 6;
+}
+if (meta->lba_size != ns->blkconf.logical_block_size) {
+return 7;
+}
+if (meta->zd_extension_size != ns->params.zd_extension_size) {
+return 8;
+}
+
+for (i = 0; i < ns->num_zones; i++, zone++) {
+if (start + zone_size > capacity) {
+zone_size = capacity - start;
+}
+if (zone->d.zt != NVME_ZONE_TYPE_SEQ_WRITE) {
+return 9;
+}
+if (zone->d.zcap != ns->zone_capacity) {
+return 10;
+}
+if (zone->d.zslba != start) {
+return 11;
+}
+switch (nvme_get_zone_state(zone)) {
+case NVME_ZONE_STATE_EMPTY:
+case NVME_ZONE_STATE_OFFLINE:
+case NVME_ZONE_STATE_READ_ONLY:
+if (zone->d.wp != start) {
+return 12;
+}
+break;
+case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+if (zone->d.wp < start ||
+zone->d.wp >= zone->d.zslba + zone->d.zcap) {
+return 13;
+}
+

[PATCH v5 08/14] hw/block/nvme: Define Zoned NS Command Set trace events

2020-09-27 Thread Dmitry Fomichev
The Zoned Namespace Command Set / Namespace Types implementation that
is being introduced in this series adds a good number of trace events.
Combine all tracepoint definitions into a separate patch to make
reviewing more convenient.

Signed-off-by: Dmitry Fomichev 
---
 hw/block/trace-events | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/hw/block/trace-events b/hw/block/trace-events
index b93429b04c..386f28e457 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -93,6 +93,17 @@ pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
 pci_nvme_cmd_supp_and_effects_log_read(void) "commands supported and effects 
log read"
 pci_nvme_css_nvm_cset_selected_by_host(uint32_t cc) "NVM command set selected 
by host, bar.cc=0x%"PRIx32""
 pci_nvme_css_all_csets_sel_by_host(uint32_t cc) "all supported command sets 
selected by host, bar.cc=0x%"PRIx32""
+pci_nvme_open_zone(uint64_t slba, uint32_t zone_idx, int all) "open zone, 
slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_close_zone(uint64_t slba, uint32_t zone_idx, int all) "close zone, 
slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, 
slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_reset_zone(uint64_t slba, uint32_t zone_idx, int all) "reset zone, 
slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_offline_zone(uint64_t slba, uint32_t zone_idx, int all) "offline 
zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_set_descriptor_extension(uint64_t slba, uint32_t zone_idx) "set zone 
descriptor extension, slba=%"PRIu64", idx=%"PRIu32""
+pci_nvme_zd_extension_set(uint32_t zone_idx) "set descriptor extension for 
zone_idx=%"PRIu32""
+pci_nvme_power_on_close(uint32_t state, uint64_t slba) "zone state=%"PRIu32", 
slba=%"PRIu64" transitioned to Closed state"
+pci_nvme_power_on_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", 
slba=%"PRIu64" transitioned to Empty state"
+pci_nvme_power_on_full(uint32_t state, uint64_t slba) "zone state=%"PRIu32", 
slba=%"PRIu64" transitioned to Full state"
+pci_nvme_mapped_zone_file(char *zfile_name, int ret) "mapped zone file %s, 
error %d"
 
 # nvme traces for error conditions
 pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu"
@@ -111,9 +122,23 @@ pci_nvme_err_invalid_prp(void) "invalid PRP"
 pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
 pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
 pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) 
"Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
+pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t slba, uint64_t zslba) 
"unaligned zone op 0x%"PRIx32", got slba=%"PRIu64", zslba=%"PRIu64""
+pci_nvme_err_invalid_zone_state_transition(uint8_t state, uint8_t action, 
uint64_t slba, uint8_t attrs) "0x%"PRIx32"->0x%"PRIx32", slba=%"PRIu64", 
attrs=0x%"PRIx32""
+pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) 
"writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64""
+pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at 
slba=%"PRIu64", but zone=%"PRIu64""
+pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint32_t status) 
"slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
+pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint32_t status) 
"slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
+pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) 
"slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8""
+pci_nvme_err_insuff_active_res(uint32_t max_active) "max_active=%"PRIu32" zone 
limit exceeded"
+pci_nvme_err_insuff_open_res(uint32_t max_open) "max_open=%"PRIu32" zone limit 
exceeded"
+pci_nvme_err_zone_file_invalid(int error) "validation error=%"PRIi32""
+pci_nvme_err_zd_extension_map_error(uint32_t zone_idx) "can't map descriptor 
extension for zone_idx=%"PRIu32""
+pci_nvme_err_invalid_changed_zone_list_offset(uint64_t ofs) "changed zone list 
log offset must be 0, got %"PRIu64""
+pci_nvme_err_invalid_changed_zone_list_len(uint32_t len) "changed zone list 
log size is 4096, got %"PRIu32""
 pci_nvme_err_invalid_effects_log_offset(uint64_t ofs) "commands supported and 
effects log offset must be 0, got %"PRIu64""
 pci_nvme_err_change_css_when_enabled(void) "changing CC.CSS while controller 
is enabled"
 pci_nvme_err_only_nvm_cmd_set_avail(void) "setting 110b CC.CSS, but only NVM 
command set is enabled"
+pci_nvme_err_only_zoned_cmd_set_avail(void) "setting 001b CC.CSS, but only 
ZONED+NVM command set is enabled"
 pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination 
index %"PRIu32""
 pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, 
sid=%"PRIu16""
 pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission 
queue, invalid cqid=%"PRIu16""
@@ -147,6 +172,7 @@ 

[PATCH v5 10/14] hw/block/nvme: Introduce max active and open zone limits

2020-09-27 Thread Dmitry Fomichev
Added two module properties, "max_active" and "max_open", to control
the maximum number of zones that can be active or open. Once these
variables are set to non-default values, these limits are checked
during I/O and Too Many Active or Too Many Open command status is
returned if they are exceeded.
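A brief sketch, not part of the patch, of the zeroes-based MAR/MOR encoding
used below; the helper name is illustrative:

#include <stdint.h>

/*
 * Illustrative only, not part of the patch: a configured limit of 0 means
 * "no limit" and is reported as all ones; any other limit N is reported
 * as N - 1.
 */
static uint32_t zeroes_based_limit(uint32_t configured_limit)
{
    return configured_limit ? configured_limit - 1 : 0xffffffffu;
}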

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme-ns.c | 42 +++-
 hw/block/nvme-ns.h | 42 
 hw/block/nvme.c| 99 ++
 3 files changed, 181 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 6d9dc9205b..63a2e3f47d 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -126,6 +126,28 @@ void nvme_remove_zone(NvmeNamespace *ns, NvmeZoneList *zl, 
NvmeZone *zone)
 zone->prev = zone->next = 0;
 }
 
+/*
+ * Take the first zone out from a list, return NULL if the list is empty.
+ */
+NvmeZone *nvme_remove_zone_head(NvmeNamespace *ns, NvmeZoneList *zl)
+{
+NvmeZone *zone = nvme_peek_zone_head(ns, zl);
+
+if (zone) {
+--zl->size;
+if (zl->size == 0) {
+zl->head = NVME_ZONE_LIST_NIL;
+zl->tail = NVME_ZONE_LIST_NIL;
+} else {
+zl->head = zone->next;
+ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL;
+}
+zone->prev = zone->next = 0;
+}
+
+return zone;
+}
+
 static int nvme_calc_zone_geometry(NvmeNamespace *ns, Error **errp)
 {
 uint64_t zone_size, zone_cap;
@@ -156,6 +178,20 @@ static int nvme_calc_zone_geometry(NvmeNamespace *ns, 
Error **errp)
 ns->zone_size_log2 = 63 - clz64(ns->zone_size);
 }
 
+/* Make sure that the values of all ZNS properties are sane */
+if (ns->params.max_open_zones > nz) {
+error_setg(errp,
+   "max_open_zones value %u exceeds the number of zones %u",
+   ns->params.max_open_zones, nz);
+return -1;
+}
+if (ns->params.max_active_zones > nz) {
+error_setg(errp,
+   "max_active_zones value %u exceeds the number of zones %u",
+   ns->params.max_active_zones, nz);
+return -1;
+}
+
 return 0;
 }
 
@@ -215,8 +251,8 @@ static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index,
 id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
 
 /* MAR/MOR are zeroes-based, 0xffffffff means no limit */
-id_ns_z->mar = 0xffffffff;
-id_ns_z->mor = 0xffffffff;
+id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
+id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
 id_ns_z->zoc = 0;
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
@@ -312,6 +348,8 @@ static Property nvme_ns_props[] = {
params.zone_capacity_mb, 0),
 DEFINE_PROP_BOOL("cross_zone_read", NvmeNamespace,
   params.cross_zone_read, false),
+DEFINE_PROP_UINT32("max_active", NvmeNamespace, params.max_active_zones, 
0),
+DEFINE_PROP_UINT32("max_open", NvmeNamespace, params.max_open_zones, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index daa13546c4..0664fe0892 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -45,6 +45,8 @@ typedef struct NvmeNamespaceParams {
 bool cross_zone_read;
 uint64_t zone_size_mb;
 uint64_t zone_capacity_mb;
+uint32_t max_active_zones;
+uint32_t max_open_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -66,6 +68,8 @@ typedef struct NvmeNamespace {
 uint64_t        zone_capacity;
 uint64_t        zone_array_size;
 uint32_t        zone_size_log2;
+int32_t nr_open_zones;
+int32_t nr_active_zones;
 
 NvmeNamespaceParams params;
 } NvmeNamespace;
@@ -189,7 +193,45 @@ static inline NvmeZone 
*nvme_next_zone_in_list(NvmeNamespace *ns, NvmeZone *z,
 return &ns->zone_array[z->next];
 }
 
+static inline void nvme_aor_inc_open(NvmeNamespace *ns)
+{
+assert(ns->nr_open_zones >= 0);
+if (ns->params.max_open_zones) {
+ns->nr_open_zones++;
+assert(ns->nr_open_zones <= ns->params.max_open_zones);
+}
+}
+
+static inline void nvme_aor_dec_open(NvmeNamespace *ns)
+{
+if (ns->params.max_open_zones) {
+assert(ns->nr_open_zones > 0);
+ns->nr_open_zones--;
+}
+assert(ns->nr_open_zones >= 0);
+}
+
+static inline void nvme_aor_inc_active(NvmeNamespace *ns)
+{
+assert(ns->nr_active_zones >= 0);
+if (ns->params.max_active_zones) {
+ns->nr_active_zones++;
+assert(ns->nr_active_zones <= ns->params.max_active_zones);
+}
+}
+
+static inline void nvme_aor_dec_active(NvmeNamespace *ns)
+{
+if (ns->params.max_active_zones) {
+assert(ns->nr_active_zones > 0);
+ns->nr_active_zones--;
+assert(ns->nr_active_zones >= ns->nr_open_zones);
+}
+assert(ns->nr_active_zones >= 0);
+}
+
 void 

[PATCH v5 09/14] hw/block/nvme: Support Zoned Namespace Command Set

2020-09-27 Thread Dmitry Fomichev
The emulation code has been changed to advertise the NVM Command Set
when the "zoned" device property is not set (the default), and the
Zoned Namespace Command Set otherwise.

Handlers for three new NVMe commands introduced in Zoned Namespace
Command Set specification are added, namely for Zone Management
Receive, Zone Management Send and Zone Append.

Device initialization code has been extended to create a proper
configuration for zoned operation using device properties.

Read/Write command handler is modified to only allow writes at the
write pointer if the namespace is zoned. For Zone Append command,
writes implicitly happen at the write pointer and the starting write
pointer value is returned as the result of the command. Write Zeroes
handler is modified to add zoned checks that are identical to those
done as a part of Write flow.

The code to support Zone Descriptor Extensions is not included in
this commit, and ZDES 0 is always reported. A later commit in this
series will add ZDE support.

This commit doesn't yet include checks for active and open zone
limits. It is assumed that there are no limits on either active or
open zones.
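The Zone Append semantics described above can be sketched in a few lines of
C (illustrative only, not patch code); the real handler also checks zone
state and the ZASL limit:

#include <stdint.h>

/* Illustrative only, not part of the patch; names are assumptions. */
typedef struct {
    uint64_t zslba;   /* zone start LBA */
    uint64_t zcap;    /* zone capacity in blocks */
    uint64_t w_ptr;   /* current write pointer */
} ZoneSketch;

static int zone_append_sketch(ZoneSketch *zone, uint32_t nlb,
                              uint64_t *result_slba)
{
    if (zone->w_ptr + nlb > zone->zslba + zone->zcap) {
        return -1;                /* would cross the writable boundary */
    }
    *result_slba = zone->w_ptr;   /* start LBA reported back to the host */
    zone->w_ptr += nlb;           /* data is placed at the write pointer */
    return 0;
}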

Signed-off-by: Niklas Cassel 
Signed-off-by: Hans Holmberg 
Signed-off-by: Ajay Joshi 
Signed-off-by: Chaitanya Kulkarni 
Signed-off-by: Matias Bjorling 
Signed-off-by: Aravind Ramesh 
Signed-off-by: Shin'ichiro Kawasaki 
Signed-off-by: Adam Manzanares 
Signed-off-by: Dmitry Fomichev 
---
 block/nvme.c |   2 +-
 hw/block/nvme-ns.c   | 185 -
 hw/block/nvme-ns.h   |   6 +-
 hw/block/nvme.c  | 872 +--
 include/block/nvme.h |   6 +-
 5 files changed, 1033 insertions(+), 38 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 05485fdd11..7a513c9a17 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -333,7 +333,7 @@ static inline int nvme_translate_error(const NvmeCqe *c)
 {
 uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
 if (status) {
-trace_nvme_error(le32_to_cpu(c->result),
+trace_nvme_error(le32_to_cpu(c->result32),
  le16_to_cpu(c->sq_head),
  le16_to_cpu(c->sq_id),
  le16_to_cpu(c->cid),
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 31b7f986c3..6d9dc9205b 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -33,14 +33,14 @@ static void nvme_ns_init(NvmeNamespace *ns)
 NvmeIdNs *id_ns = &ns->id_ns;
 
 if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
-ns->id_ns.dlfeat = 0x9;
+ns->id_ns.dlfeat = 0x8;
 }
 
 id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
 
 id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
 
-ns->params.csi = NVME_CSI_NVM;
+ns->csi = NVME_CSI_NVM;
 qemu_uuid_generate(&ns->params.uuid); /* TODO make UUIDs persistent */
 
 /* no thin provisioning */
@@ -73,7 +73,162 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, 
Error **errp)
 }
 
 lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
-ns->id_ns.lbaf[lba_index].ds = 31 - clz32(n->conf.logical_block_size);
+ns->id_ns.lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
+
+return 0;
+}
+
+/*
+ * Add a zone to the tail of a zone list.
+ */
+void nvme_add_zone_tail(NvmeNamespace *ns, NvmeZoneList *zl, NvmeZone *zone)
+{
+uint32_t idx = (uint32_t)(zone - ns->zone_array);
+
+assert(nvme_zone_not_in_list(zone));
+
+if (!zl->size) {
+zl->head = zl->tail = idx;
+zone->next = zone->prev = NVME_ZONE_LIST_NIL;
+} else {
+ns->zone_array[zl->tail].next = idx;
+zone->prev = zl->tail;
+zone->next = NVME_ZONE_LIST_NIL;
+zl->tail = idx;
+}
+zl->size++;
+}
+
+/*
+ * Remove a zone from a zone list. The zone must be linked in the list.
+ */
+void nvme_remove_zone(NvmeNamespace *ns, NvmeZoneList *zl, NvmeZone *zone)
+{
+uint32_t idx = (uint32_t)(zone - ns->zone_array);
+
+assert(!nvme_zone_not_in_list(zone));
+
+--zl->size;
+if (zl->size == 0) {
+zl->head = NVME_ZONE_LIST_NIL;
+zl->tail = NVME_ZONE_LIST_NIL;
+} else if (idx == zl->head) {
+zl->head = zone->next;
+ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL;
+} else if (idx == zl->tail) {
+zl->tail = zone->prev;
+ns->zone_array[zl->tail].next = NVME_ZONE_LIST_NIL;
+} else {
+ns->zone_array[zone->next].prev = zone->prev;
+ns->zone_array[zone->prev].next = zone->next;
+}
+
+zone->prev = zone->next = 0;
+}
+
+static int nvme_calc_zone_geometry(NvmeNamespace *ns, Error **errp)
+{
+uint64_t zone_size, zone_cap;
+uint32_t nz, lbasz = ns->blkconf.logical_block_size;
+
+if (ns->params.zone_size_mb) {
+zone_size = ns->params.zone_size_mb;
+} else {
+zone_size = NVME_DEFAULT_ZONE_SIZE;
+}
+if (ns->params.zone_capacity_mb) {
+zone_cap = ns->params.zone_capacity_mb;
+} else {
+zone_cap = zone_size;
+}
+ 

[PATCH v5 07/14] hw/block/nvme: Make Zoned NS Command Set definitions

2020-09-27 Thread Dmitry Fomichev
Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.

All new protocol definitions are located in include/block/nvme.h
and everything added that is specific to this implementation is kept
in hw/block/nvme.h.

In order to improve scalability, all open, closed and full zones
are organized in separate linked lists. Consequently, almost all
zone operations don't require scanning the entire zone array
(which can potentially be quite large); it is only necessary to
enumerate one or more zone lists. Zone lists are designed to be
position-independent because they can be persisted to the backing
file as a part of the zone metadata. The NvmeZoneList struct defined
in this patch serves as the head of every zone list.

The NvmeZone structure encapsulates the NvmeZoneDescriptor defined in
the Zoned Namespace Command Set specification and adds a few more
fields that are internal to this implementation.
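A short illustrative sketch (not patch code) of the position-independent,
index-linked list traversal described above; type and function names are
assumptions:

#include <stdint.h>

/*
 * Illustrative only, not part of the patch: zones reference each other by
 * array index rather than by pointer, so the same metadata stays valid
 * when it is mapped at a different address.
 */
#define LIST_NIL UINT32_MAX

typedef struct {
    uint32_t next;
    uint32_t prev;
} ZoneLinkSketch;

typedef struct {
    uint32_t head;
    uint32_t tail;
    uint32_t size;
} ZoneListSketch;

static void walk_zone_list(const ZoneLinkSketch *zone_array,
                           const ZoneListSketch *zl,
                           void (*visit)(uint32_t idx))
{
    for (uint32_t idx = zl->head; idx != LIST_NIL;
         idx = zone_array[idx].next) {
        visit(idx);
    }
}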

Signed-off-by: Niklas Cassel 
Signed-off-by: Hans Holmberg 
Signed-off-by: Ajay Joshi 
Signed-off-by: Matias Bjorling 
Signed-off-by: Shin'ichiro Kawasaki 
Signed-off-by: Alexey Bogoslavsky 
Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme-ns.h   | 114 +++
 hw/block/nvme.h  |  10 
 include/block/nvme.h | 107 
 3 files changed, 231 insertions(+)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index acdb76f058..04172f083e 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -19,11 +19,33 @@
 #define NVME_NS(obj) \
 OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
 
+typedef struct NvmeZone {
+NvmeZoneDescr   d;
+uint64_t        w_ptr;
+uint32_t        next;
+uint32_t        prev;
+uint8_t rsvd80[8];
+} NvmeZone;
+
+#define NVME_ZONE_LIST_NIL    UINT_MAX
+
+typedef struct NvmeZoneList {
+uint32_t        head;
+uint32_t        tail;
+uint32_t        size;
+uint8_t rsvd12[4];
+} NvmeZoneList;
+
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
 uint8_t  csi;
 bool attached;
 QemuUUID uuid;
+
+bool zoned;
+bool cross_zone_read;
+uint64_t zone_size_mb;
+uint64_t zone_capacity_mb;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -33,6 +55,18 @@ typedef struct NvmeNamespace {
 int64_t  size;
 NvmeIdNs id_ns;
 
+NvmeIdNsZoned   *id_ns_zoned;
+NvmeZone        *zone_array;
+NvmeZoneList    *exp_open_zones;
+NvmeZoneList    *imp_open_zones;
+NvmeZoneList    *closed_zones;
+NvmeZoneList    *full_zones;
+uint32_t        num_zones;
+uint64_t        zone_size;
+uint64_t        zone_capacity;
+uint64_t        zone_array_size;
+uint32_t        zone_size_log2;
+
 NvmeNamespaceParams params;
 } NvmeNamespace;
 
@@ -74,4 +108,84 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error 
**errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_flush(NvmeNamespace *ns);
 
+static inline uint8_t nvme_get_zone_state(NvmeZone *zone)
+{
+return zone->d.zs >> 4;
+}
+
+static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState 
state)
+{
+zone->d.zs = state << 4;
+}
+
+static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone)
+{
+return zone->d.zslba + ns->zone_size;
+}
+
+static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
+{
+return zone->d.zslba + zone->d.zcap;
+}
+
+static inline bool nvme_wp_is_valid(NvmeZone *zone)
+{
+uint8_t st = nvme_get_zone_state(zone);
+
+return st != NVME_ZONE_STATE_FULL &&
+   st != NVME_ZONE_STATE_READ_ONLY &&
+   st != NVME_ZONE_STATE_OFFLINE;
+}
+
+/*
+ * Initialize a zone list head.
+ */
+static inline void nvme_init_zone_list(NvmeZoneList *zl)
+{
+zl->head = NVME_ZONE_LIST_NIL;
+zl->tail = NVME_ZONE_LIST_NIL;
+zl->size = 0;
+}
+
+/*
+ * Return the number of entries contained in a zone list.
+ */
+static inline uint32_t nvme_zone_list_size(NvmeZoneList *zl)
+{
+return zl->size;
+}
+
+/*
+ * Check if the zone is not currently included in any zone list.
+ */
+static inline bool nvme_zone_not_in_list(NvmeZone *zone)
+{
+return (bool)(zone->prev == 0 && zone->next == 0);
+}
+
+/*
+ * Return the zone at the head of zone list or NULL if the list is empty.
+ */
+static inline NvmeZone *nvme_peek_zone_head(NvmeNamespace *ns, NvmeZoneList 
*zl)
+{
+if (zl->head == NVME_ZONE_LIST_NIL) {
+return NULL;
+}
+return &ns->zone_array[zl->head];
+}
+
+/*
+ * Return the next zone in the list.
+ */
+static inline NvmeZone *nvme_next_zone_in_list(NvmeNamespace *ns, NvmeZone *z,
+   NvmeZoneList *zl)
+{
+assert(!nvme_zone_not_in_list(z));
+
+if (z->next == NVME_ZONE_LIST_NIL) {
+return NULL;
+}
+return &ns->zone_array[z->next];
+}
+
 #endif /* NVME_NS_H */
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 

[PATCH v5 11/14] hw/block/nvme: Support Zone Descriptor Extensions

2020-09-27 Thread Dmitry Fomichev
A Zone Descriptor Extension is a label that can be assigned to a zone.
It can be set on an Empty zone and stays assigned until the zone
is reset.

This commit adds a new optional module property, "zone_descr_ext_size".
Its value must be a multiple of 64 bytes. If this value is non-zero,
it becomes possible to assign extensions of that size to any Empty
zones. The default value for this property is 0, therefore setting
extensions is disabled by default.
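Sketched below (illustrative, not patch code): the extension layout used by
this patch - one flat buffer with one zd_extension_size-byte entry per zone,
ZDES reported in 64-byte units; function names are assumptions:

#include <stddef.h>
#include <stdint.h>

/* Illustrative only, not part of the patch. */
static uint8_t *zd_extension_for(uint8_t *zd_extensions,
                                 uint32_t zd_extension_size,
                                 uint32_t zone_idx)
{
    return &zd_extensions[(size_t)zone_idx * zd_extension_size];
}

static uint8_t zdes_in_64b_units(uint32_t zd_extension_size)
{
    return (uint8_t)(zd_extension_size >> 6);
}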

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme-ns.c | 10 -
 hw/block/nvme-ns.h |  8 
 hw/block/nvme.c| 51 --
 3 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 63a2e3f47d..60156dfeaf 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -207,6 +207,10 @@ static void nvme_init_zone_meta(NvmeNamespace *ns)
 ns->imp_open_zones = g_malloc0(sizeof(NvmeZoneList));
 ns->closed_zones = g_malloc0(sizeof(NvmeZoneList));
 ns->full_zones = g_malloc0(sizeof(NvmeZoneList));
+if (ns->params.zd_extension_size) {
+ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
+  ns->num_zones);
+}
 
 nvme_init_zone_list(ns->exp_open_zones);
 nvme_init_zone_list(ns->imp_open_zones);
@@ -257,7 +261,8 @@ static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index,
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
 id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
-id_ns_z->lbafe[lba_index].zdes = 0; /* FIXME make helper */
+id_ns_z->lbafe[lba_index].zdes =
+ns->params.zd_extension_size >> 6; /* Units of 64B */
 
 ns->csi = NVME_CSI_ZONED;
 ns->id_ns.ncap = cpu_to_le64(ns->zone_capacity * ns->num_zones);
@@ -321,6 +326,7 @@ void nvme_ns_cleanup(NvmeNamespace *ns)
 g_free(ns->imp_open_zones);
 g_free(ns->closed_zones);
 g_free(ns->full_zones);
+g_free(ns->zd_extensions);
 }
 
 static void nvme_ns_realize(DeviceState *dev, Error **errp)
@@ -350,6 +356,8 @@ static Property nvme_ns_props[] = {
   params.cross_zone_read, false),
 DEFINE_PROP_UINT32("max_active", NvmeNamespace, params.max_active_zones, 
0),
 DEFINE_PROP_UINT32("max_open", NvmeNamespace, params.max_open_zones, 0),
+DEFINE_PROP_UINT32("zone_descr_ext_size", NvmeNamespace,
+   params.zd_extension_size, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 0664fe0892..ed14644e09 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -47,6 +47,7 @@ typedef struct NvmeNamespaceParams {
 uint64_t zone_capacity_mb;
 uint32_t max_active_zones;
 uint32_t max_open_zones;
+uint32_t zd_extension_size;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -68,6 +69,7 @@ typedef struct NvmeNamespace {
 uint64_t        zone_capacity;
 uint64_t        zone_array_size;
 uint32_t        zone_size_log2;
+uint8_t *zd_extensions;
 int32_t nr_open_zones;
 int32_t nr_active_zones;
 
@@ -142,6 +144,12 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
+ uint32_t zone_idx)
+{
+return &ns->zd_extensions[zone_idx * ns->params.zd_extension_size];
+}
+
 /*
  * Initialize a zone list head.
  */
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 40947aa659..27d191c659 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1644,6 +1644,26 @@ static bool nvme_cond_offline_all(uint8_t state)
 return state == NVME_ZONE_STATE_READ_ONLY;
 }
 
+static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone,
+uint8_t state)
+{
+uint16_t status;
+
+if (state == NVME_ZONE_STATE_EMPTY) {
+nvme_auto_transition_zone(ns, false, true);
+status = nvme_aor_check(ns, 1, 0);
+if (status != NVME_SUCCESS) {
+return status;
+}
+nvme_aor_inc_active(ns);
+zone->d.za |= NVME_ZA_ZD_EXT_VALID;
+nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
+return NVME_SUCCESS;
+}
+
+return NVME_ZONE_INVAL_TRANSITION;
+}
+
 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *,
  uint8_t);
 typedef bool (*need_to_proc_zone_t)(uint8_t);
@@ -1684,6 +1704,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, 
NvmeRequest *req)
 uint8_t action, state;
 bool all;
 NvmeZone *zone;
+uint8_t *zd_ext;
 
 action = dw13 & 0xff;
 all = dw13 & 0x100;
@@ -1738,7 +1759,22 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, 
NvmeRequest *req)
 
 case NVME_ZONE_ACTION_SET_ZD_EXT:
 

[PATCH v5 04/14] hw/block/nvme: Define trace events related to NS Types

2020-09-27 Thread Dmitry Fomichev
A few trace events are defined that are relevant to implementing
Namespace Types (NVMe TP 4056).

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Klaus Jensen 
---
 hw/block/trace-events | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/hw/block/trace-events b/hw/block/trace-events
index 2929a8df11..b93429b04c 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -49,8 +49,12 @@ pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t 
vector, uint16_t size,
 pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
 pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16""
 pci_nvme_identify_ctrl(void) "identify controller"
+pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8""
 pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32""
+pci_nvme_identify_ns_csi(uint32_t ns, uint8_t csi) "nsid=%"PRIu32", 
csi=0x%"PRIx8""
 pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32""
+pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", 
csi=0x%"PRIx8""
+pci_nvme_identify_cmd_set(void) "identify i/o command set"
 pci_nvme_identify_ns_descr_list(uint32_t ns) "nsid %"PRIu32""
 pci_nvme_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t 
len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8" 
len %"PRIu32" off %"PRIu64""
 pci_nvme_getfeat(uint16_t cid, uint8_t fid, uint8_t sel, uint32_t cdw11) "cid 
%"PRIu16" fid 0x%"PRIx8" sel 0x%"PRIx8" cdw11 0x%"PRIx32""
@@ -87,6 +91,8 @@ pci_nvme_mmio_stopped(void) "cleared controller enable bit"
 pci_nvme_mmio_shutdown_set(void) "shutdown bit set"
 pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
 pci_nvme_cmd_supp_and_effects_log_read(void) "commands supported and effects 
log read"
+pci_nvme_css_nvm_cset_selected_by_host(uint32_t cc) "NVM command set selected 
by host, bar.cc=0x%"PRIx32""
+pci_nvme_css_all_csets_sel_by_host(uint32_t cc) "all supported command sets 
selected by host, bar.cc=0x%"PRIx32""
 
 # nvme traces for error conditions
 pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu"
@@ -106,6 +112,9 @@ pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 
0x%"PRIx8""
 pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
 pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) 
"Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
 pci_nvme_err_invalid_effects_log_offset(uint64_t ofs) "commands supported and 
effects log offset must be 0, got %"PRIu64""
+pci_nvme_err_change_css_when_enabled(void) "changing CC.CSS while controller 
is enabled"
+pci_nvme_err_only_nvm_cmd_set_avail(void) "setting 110b CC.CSS, but only NVM 
command set is enabled"
+pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination 
index %"PRIu32""
 pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, 
sid=%"PRIu16""
 pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission 
queue, invalid cqid=%"PRIu16""
 pci_nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission 
queue, invalid sqid=%"PRIu16""
@@ -161,6 +170,7 @@ pci_nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion 
queue doorbell write for
 pci_nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion 
queue doorbell write value beyond queue size, cqid=%"PRIu32", 
new_head=%"PRIu16", ignoring"
 pci_nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write 
for nonexistent queue, sqid=%"PRIu32", ignoring"
 pci_nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission 
queue doorbell write value beyond queue size, sqid=%"PRIu32", 
new_head=%"PRIu16", ignoring"
+pci_nvme_ub_unknown_css_value(void) "unknown value in cc.css field"
 
 # xen-block.c
 xen_block_realize(const char *type, uint32_t disk, uint32_t partition) "%s 
d%up%u"
-- 
2.21.0




[PATCH v5 03/14] hw/block/nvme: Introduce the Namespace Types definitions

2020-09-27 Thread Dmitry Fomichev
From: Niklas Cassel 

Define the structures and constants required to implement
Namespace Types support.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme-ns.h   |  2 ++
 hw/block/nvme.c  |  2 +-
 include/block/nvme.h | 74 +++-
 3 files changed, 63 insertions(+), 15 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 83734f4606..cca23bc0b3 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -21,6 +21,8 @@
 
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
+uint8_t  csi;
+QemuUUID uuid;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1ddc7e52cc..29fa005fa2 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1598,7 +1598,7 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, 
NvmeRequest *req)
  * here.
  */
 ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
-ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN;
+ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID;
 stl_be_p(&ns_descrs->uuid.v, nsid);
 
 return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE,
diff --git a/include/block/nvme.h b/include/block/nvme.h
index a738c8f9ba..4587311783 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -51,6 +51,11 @@ enum NvmeCapMask {
 CAP_PMR_MASK   = 0x1,
 };
 
+enum NvmeCapCssBits {
+CAP_CSS_NVM= 0x01,
+CAP_CSS_CSI_SUPP   = 0x40,
+};
+
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
 #define NVME_CAP_CQR(cap)   (((cap) >> CAP_CQR_SHIFT)& CAP_CQR_MASK)
 #define NVME_CAP_AMS(cap)   (((cap) >> CAP_AMS_SHIFT)& CAP_AMS_MASK)
@@ -102,6 +107,12 @@ enum NvmeCcMask {
 CC_IOCQES_MASK  = 0xf,
 };
 
+enum NvmeCcCss {
+CSS_NVM_ONLY= 0,
+CSS_CSI = 6,
+CSS_ADMIN_ONLY  = 7,
+};
+
 #define NVME_CC_EN(cc) ((cc >> CC_EN_SHIFT) & CC_EN_MASK)
 #define NVME_CC_CSS(cc)((cc >> CC_CSS_SHIFT)& CC_CSS_MASK)
 #define NVME_CC_MPS(cc)((cc >> CC_MPS_SHIFT)& CC_MPS_MASK)
@@ -110,6 +121,21 @@ enum NvmeCcMask {
 #define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK)
 #define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK)
 
+#define NVME_SET_CC_EN(cc, val) \
+(cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT)
+#define NVME_SET_CC_CSS(cc, val)\
+(cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT)
+#define NVME_SET_CC_MPS(cc, val)\
+(cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT)
+#define NVME_SET_CC_AMS(cc, val)\
+(cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT)
+#define NVME_SET_CC_SHN(cc, val)\
+(cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT)
+#define NVME_SET_CC_IOSQES(cc, val) \
+(cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT)
+#define NVME_SET_CC_IOCQES(cc, val) \
+(cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT)
+
 enum NvmeCstsShift {
 CSTS_RDY_SHIFT  = 0,
 CSTS_CFS_SHIFT  = 1,
@@ -524,8 +550,13 @@ typedef struct QEMU_PACKED NvmeIdentify {
 uint64_t    rsvd2[2];
 uint64_t    prp1;
 uint64_t    prp2;
-uint32_t    cns;
-uint32_t    rsvd11[5];
+uint8_t     cns;
+uint8_t     rsvd10;
+uint16_t    ctrlid;
+uint16_t    nvmsetid;
+uint8_t     rsvd11;
+uint8_t     csi;
+uint32_t    rsvd12[4];
 } NvmeIdentify;
 
 typedef struct QEMU_PACKED NvmeRwCmd {
@@ -645,6 +676,7 @@ enum NvmeStatusCodes {
 NVME_MD_SGL_LEN_INVALID = 0x0010,
 NVME_SGL_DESCR_TYPE_INVALID = 0x0011,
 NVME_INVALID_USE_OF_CMB = 0x0012,
+NVME_CMD_SET_CMB_REJECTED   = 0x002b,
 NVME_LBA_RANGE  = 0x0080,
 NVME_CAP_EXCEEDED   = 0x0081,
 NVME_NS_NOT_READY   = 0x0082,
@@ -771,11 +803,15 @@ typedef struct QEMU_PACKED NvmePSD {
 
 #define NVME_IDENTIFY_DATA_SIZE 4096
 
-enum {
-NVME_ID_CNS_NS = 0x0,
-NVME_ID_CNS_CTRL   = 0x1,
-NVME_ID_CNS_NS_ACTIVE_LIST = 0x2,
-NVME_ID_CNS_NS_DESCR_LIST  = 0x3,
+enum NvmeIdCns {
+NVME_ID_CNS_NS= 0x00,
+NVME_ID_CNS_CTRL  = 0x01,
+NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
+NVME_ID_CNS_NS_DESCR_LIST = 0x03,
+NVME_ID_CNS_CS_NS = 0x05,
+NVME_ID_CNS_CS_CTRL   = 0x06,
+NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
+NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
 };
 
 typedef struct QEMU_PACKED NvmeIdCtrl {
@@ -922,6 +958,7 @@ enum NvmeFeatureIds {
 NVME_WRITE_ATOMICITY= 0xa,
 NVME_ASYNCHRONOUS_EVENT_CONF= 0xb,
 NVME_TIMESTAMP  = 0xe,
+NVME_COMMAND_SET_PROFILE= 0x19,
 NVME_SOFTWARE_PROGRESS_MARKER   = 0x80,
 NVME_FID_MAX= 0x100,
 };
@@ -1006,18 +1043,26 @@ typedef struct QEMU_PACKED NvmeIdNsDescr {
 uint8_t rsvd2[2];
 } NvmeIdNsDescr;
 
-enum {
-NVME_NIDT_EUI64_LEN =  8,
-NVME_NIDT_NGUID_LEN = 16,

[PATCH v5 05/14] hw/block/nvme: Add support for Namespace Types

2020-09-27 Thread Dmitry Fomichev
From: Niklas Cassel 

Namespace Types introduce a new command set, "I/O Command Sets",
that allows the host to retrieve the command sets associated with
a namespace. Introduce support for the command set and enable
detection for the NVM Command Set.

The new workflows for identify commands rely heavily on zero-filled
identify structs. E.g., certain CNS commands are defined to return
a zero-filled identify struct when an inactive namespace NSID
is supplied.

Add a helper function in order to avoid code duplication when
reporting zero-filled identify structures.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme-ns.c |   3 +
 hw/block/nvme.c| 210 +
 2 files changed, 175 insertions(+), 38 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index bbd7879492..31b7f986c3 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -40,6 +40,9 @@ static void nvme_ns_init(NvmeNamespace *ns)
 
 id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
 
+ns->params.csi = NVME_CSI_NVM;
+qemu_uuid_generate(&ns->params.uuid); /* TODO make UUIDs persistent */
+
 /* no thin provisioning */
 id_ns->ncap = id_ns->nsze;
 id_ns->nuse = id_ns->ncap;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 29fa005fa2..4ec1ddc90a 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1495,6 +1495,13 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest 
*req)
 return NVME_SUCCESS;
 }
 
+static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
+{
+uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
+
+return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req);
+}
+
 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
 {
 trace_pci_nvme_identify_ctrl();
@@ -1503,11 +1510,23 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, 
NvmeRequest *req)
 DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+
+trace_pci_nvme_identify_ctrl_csi(c->csi);
+
+if (c->csi == NVME_CSI_NVM) {
+return nvme_rpt_empty_id_struct(n, req);
+}
+
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
-NvmeIdNs *id_ns, inactive = { 0 };
 uint32_t nsid = le32_to_cpu(c->nsid);
 
 trace_pci_nvme_identify_ns(nsid);
@@ -1518,23 +1537,46 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req)
 
 ns = nvme_ns(n, nsid);
 if (unlikely(!ns)) {
-id_ns = &inactive;
-} else {
-id_ns = &ns->id_ns;
+return nvme_rpt_empty_id_struct(n, req);
 }
 
-return nvme_dma(n, (uint8_t *)id_ns, sizeof(NvmeIdNs),
+return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs),
 DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeNamespace *ns;
+NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+uint32_t nsid = le32_to_cpu(c->nsid);
+
+trace_pci_nvme_identify_ns_csi(nsid, c->csi);
+
+if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
+return NVME_INVALID_NSID | NVME_DNR;
+}
+
+ns = nvme_ns(n, nsid);
+if (unlikely(!ns)) {
+return nvme_rpt_empty_id_struct(n, req);
+}
+
+if (c->csi == NVME_CSI_NVM) {
+return nvme_rpt_empty_id_struct(n, req);
+}
+
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
 {
+NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
-static const int data_len = NVME_IDENTIFY_DATA_SIZE;
 uint32_t min_nsid = le32_to_cpu(c->nsid);
-uint32_t *list;
-uint16_t ret;
-int j = 0;
+uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
+static const int data_len = sizeof(list);
+uint32_t *list_ptr = (uint32_t *)list;
+int i, j = 0;
 
 trace_pci_nvme_identify_nslist(min_nsid);
 
@@ -1548,48 +1590,76 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_NSID | NVME_DNR;
 }
 
-list = g_malloc0(data_len);
-for (int i = 1; i <= n->num_namespaces; i++) {
-if (i <= min_nsid || !nvme_ns(n, i)) {
+for (i = 1; i <= n->num_namespaces; i++) {
+ns = nvme_ns(n, i);
+if (!ns) {
 continue;
 }
-list[j++] = cpu_to_le32(i);
+if (ns->params.nsid < min_nsid) {
+continue;
+}
+list_ptr[j++] = cpu_to_le32(ns->params.nsid);
 if (j == data_len / sizeof(uint32_t)) {
 break;
 }
 }
-ret = nvme_dma(n, (uint8_t *)list, data_len, DMA_DIRECTION_FROM_DEVICE,
-   req);
-g_free(list);
-return ret;
+
+return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
+}
+
+static 

[PATCH v5 06/14] hw/block/nvme: Add support for active/inactive namespaces

2020-09-27 Thread Dmitry Fomichev
From: Niklas Cassel 

In NVMe, a namespace is active if it exists and is attached to the
controller.

CAP.CSS (together with the I/O Command Set data structure) defines what
command sets are supported by the controller.

CC.CSS (together with Set Profile) can be set to enable a subset of the
available command sets. The namespaces belonging to a disabled command set
will not be able to attach to the controller, and will thus be inactive.

E.g., if the user sets CC.CSS to Admin Only, NVM namespaces should be
marked as inactive.

The identify namespace, the identify namespace CSI specific, and the namespace
list commands have two different versions, one that only shows active
namespaces, and the other version that shows existing namespaces, regardless
of whether the namespace is attached or not.

Add an attached member to struct NvmeNamespace, and implement the missing CNS
commands.

The added functionality will also simplify the implementation of namespace
management in the future, since namespace management can also attach and
detach namespaces.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme-ns.h   |  1 +
 hw/block/nvme.c  | 60 ++--
 include/block/nvme.h | 20 +--
 3 files changed, 65 insertions(+), 16 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index cca23bc0b3..acdb76f058 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -22,6 +22,7 @@
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
 uint8_t  csi;
+bool attached;
 QemuUUID uuid;
 } NvmeNamespaceParams;
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 4ec1ddc90a..63ad03d6d6 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1523,7 +1523,8 @@ static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req,
+ bool only_active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
@@ -1540,11 +1541,16 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req)
 return nvme_rpt_empty_id_struct(n, req);
 }
 
+if (only_active && !ns->params.attached) {
+return nvme_rpt_empty_id_struct(n, req);
+}
+
 return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs),
 DMA_DIRECTION_FROM_DEVICE, req);
 }
 
-static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
+ bool only_active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
@@ -1561,6 +1567,10 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, 
NvmeRequest *req)
 return nvme_rpt_empty_id_struct(n, req);
 }
 
+if (only_active && !ns->params.attached) {
+return nvme_rpt_empty_id_struct(n, req);
+}
+
 if (c->csi == NVME_CSI_NVM) {
 return nvme_rpt_empty_id_struct(n, req);
 }
@@ -1568,7 +1578,8 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
+ bool only_active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
@@ -1598,6 +1609,9 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 if (ns->params.nsid < min_nsid) {
 continue;
 }
+if (only_active && !ns->params.attached) {
+continue;
+}
 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
 if (j == data_len / sizeof(uint32_t)) {
 break;
@@ -1607,7 +1621,8 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
 }
 
-static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
+ bool only_active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
@@ -1631,6 +1646,9 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, 
NvmeRequest *req)
 if (ns->params.nsid < min_nsid) {
 continue;
 }
+if (only_active && !ns->params.attached) {
+continue;
+}
 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
 if (j == data_len / sizeof(uint32_t)) {
 break;
@@ -1700,17 +1718,25 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 
 switch (le32_to_cpu(c->cns)) {
 case NVME_ID_CNS_NS:
-return nvme_identify_ns(n, req);
+return nvme_identify_ns(n, req, true);
 

[PATCH v5 02/14] hw/block/nvme: Add Commands Supported and Effects log

2020-09-27 Thread Dmitry Fomichev
Implementing this log page becomes necessary to allow checking for
Zone Append command support in the Zoned Namespace Command Set.

This commit adds the code to report this log page for NVM Command
Set only. The parts that are specific to zoned operation will be
added later in the series.
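A hedged host-side sketch (not part of the patch) of the check this log page
enables: read the Commands Supported and Effects log and test the CSUPP bit
of the I/O command entry for a given opcode, e.g. Zone Append. The struct
mirrors the acs/iocs layout added below; names are assumptions:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only, not part of the patch; trailing reserved bytes omitted. */
typedef struct {
    uint32_t acs[256];    /* admin command entries */
    uint32_t iocs[256];   /* I/O command entries */
} EffectsLogSketch;

static bool io_opcode_supported(const EffectsLogSketch *log, uint8_t opc)
{
    return log->iocs[opc] & 0x1;   /* CSUPP is bit 0 of each entry */
}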

Signed-off-by: Dmitry Fomichev 
---
 hw/block/nvme.c   | 41 -
 hw/block/trace-events |  2 ++
 include/block/nvme.h  | 19 +++
 3 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index da8344f196..1ddc7e52cc 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1301,6 +1301,43 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len,
 DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len,
+ uint64_t off, NvmeRequest *req)
+{
+NvmeEffectsLog cmd_eff_log = {};
+uint32_t *iocs = cmd_eff_log.iocs;
+uint32_t *acs = cmd_eff_log.acs;
+uint32_t trans_len;
+
+trace_pci_nvme_cmd_supp_and_effects_log_read();
+
+if (off >= sizeof(cmd_eff_log)) {
+trace_pci_nvme_err_invalid_effects_log_offset(off);
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+acs[NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFFECTS_CSUPP;
+acs[NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFFECTS_CSUPP;
+acs[NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFFECTS_CSUPP;
+acs[NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFFECTS_CSUPP;
+acs[NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFFECTS_CSUPP;
+acs[NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFFECTS_CSUPP;
+acs[NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFFECTS_CSUPP;
+acs[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFFECTS_CSUPP;
+acs[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFFECTS_CSUPP;
+
+iocs[NVME_CMD_FLUSH] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
+iocs[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFFECTS_CSUPP |
+  NVME_CMD_EFFECTS_LBCC;
+iocs[NVME_CMD_WRITE] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
+iocs[NVME_CMD_READ] = NVME_CMD_EFFECTS_CSUPP;
+
+trans_len = MIN(sizeof(cmd_eff_log) - off, buf_len);
+
+return nvme_dma(n, ((uint8_t *)&cmd_eff_log) + off, trans_len,
+DMA_DIRECTION_FROM_DEVICE, req);
+}
+
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeCmd *cmd = &req->cmd;
@@ -1344,6 +1381,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_smart_info(n, rae, len, off, req);
 case NVME_LOG_FW_SLOT_INFO:
 return nvme_fw_log_info(n, len, off, req);
+case NVME_LOG_CMD_EFFECTS:
+return nvme_cmd_effects(n, len, off, req);
 default:
 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
 return NVME_INVALID_FIELD | NVME_DNR;
@@ -2743,7 +2782,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 id->acl = 3;
 id->aerl = n->params.aerl;
 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
-id->lpa = NVME_LPA_EXTENDED;
+id->lpa = NVME_LPA_CSE | NVME_LPA_EXTENDED;
 
 /* recommended default value (~70 C) */
 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
diff --git a/hw/block/trace-events b/hw/block/trace-events
index bbe6f27367..2929a8df11 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -86,6 +86,7 @@ pci_nvme_mmio_start_success(void) "setting controller enable 
bit succeeded"
 pci_nvme_mmio_stopped(void) "cleared controller enable bit"
 pci_nvme_mmio_shutdown_set(void) "shutdown bit set"
 pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
+pci_nvme_cmd_supp_and_effects_log_read(void) "commands supported and effects 
log read"
 
 # nvme traces for error conditions
 pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu"
@@ -104,6 +105,7 @@ pci_nvme_err_invalid_prp(void) "invalid PRP"
 pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
 pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
 pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) 
"Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
+pci_nvme_err_invalid_effects_log_offset(uint64_t ofs) "commands supported and 
effects log offset must be 0, got %"PRIu64""
 pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, 
sid=%"PRIu16""
 pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission 
queue, invalid cqid=%"PRIu16""
 pci_nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission 
queue, invalid sqid=%"PRIu16""
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 58647bcdad..a738c8f9ba 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -734,10 +734,27 @@ enum NvmeSmartWarn {
 NVME_SMART_FAILED_VOLATILE_MEDIA  = 1 << 4,
 };
 
+typedef struct NvmeEffectsLog {
+    uint32_t    acs[256];
+    uint32_t    iocs[256];

[PATCH v5 00/14] hw/block/nvme: Support Namespace Types and Zoned Namespace Command Set

2020-09-27 Thread Dmitry Fomichev
v4 -> v5

 - Rebase to the current qemu-nvme.

 - Use HostMemoryBackendFile as the backing storage for persistent
   zone metadata.

 - Fix the issue with filling the valid data in the next zone if RAZBi
   is enabled.

v3 -> v4

 - Fix bugs introduced in v2/v3 for QD > 1 operation. Now, all writes
   to a zone happen at the new write pointer variable, zone->w_ptr,
   which is advanced right after submitting the backend i/o. The existing
   zone->d.wp variable is updated upon successful write completion and is
   used for zone reporting (see the sketch after this list). Some code has
   been split from the nvme_finalize_zoned_write() function into a new
   function, nvme_advance_zone_wp().

 - Make the code compile under mingw. Switch to using QEMU API for
   mmap/msync, i.e. memory_region...(). Since mmap is not available in
   mingw (even though there is mman-win32 library available on Github),
   conditional compilation is added around these calls to avoid
   undefined symbols under mingw. A better fix would be to add stub
   functions to softmmu/memory.c for the case when CONFIG_POSIX is not
   defined, but such a change is beyond the scope of this patchset and it
   can be made in a separate patch.

 - Correct permission mask used to open zone metadata file.

 - Fold "Define 64 bit cqe.result" patch into ZNS commit.

 - Use clz64/clz32 instead of defining nvme_ilog2() function.

 - Simplify rpt_empty_id_struct() code, move nvme_fill_data() back
   to ZNS patch.

 - Fix a power-on processing bug.

 - Rename NVME_CMD_ZONE_APND to NVME_CMD_ZONE_APPEND.

 - Make the list of review comments addressed in v2 of the series
   (see below).
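
For reference, a minimal sketch of the two-pointer scheme from the first
item above (hypothetical, simplified types; not the exact patch code):

    #include <stdint.h>

    /* Illustration only: zone->w_ptr vs. zone->d.wp. */
    struct zone {
        uint64_t w_ptr;            /* advanced at submission time        */
        struct {
            uint64_t wp;           /* advanced on successful completion  */
        } d;
    };

    static uint64_t zone_submit_write(struct zone *z, uint32_t nlb)
    {
        uint64_t slba = z->w_ptr;

        z->w_ptr += nlb;           /* next queued write (QD > 1) starts here */
        return slba;               /* LBA to hand to the backend i/o         */
    }

    static void zone_write_completed(struct zone *z, uint32_t nlb)
    {
        z->d.wp += nlb;            /* reported write pointer moves on success */
    }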

v2 -> v3:

 - Moved nvme_fill_data() function to the NSTypes patch as it is
   now used there to output empty namespace identify structs.
 - Fixed typo in Maxim's email address.

v1 -> v2:

 - Rebased on top of qemu-nvme/next branch.
 - Incorporated feedback from Klaus and Alistair.
* Allow a subset of CSE log to be read, not the entire log
* Assign admin command entries in CSE log to ACS fields
* Set LPA bit 1 to indicate support of CSE log page
* Rename CC.CSS value CSS_ALL_NSTYPES (110b) to CSS_CSI
* Move the code to assign lbaf.ds to a separate patch
* Remove the change in firmware revision
* Change "driver" to "device" in comments and annotations
* Rename ZAMDS to ZASL
* Correct a few format expressions and some wording in
  trace event definitions
* Remove validation code to return NVME_CAP_EXCEEDED error
* Make ZASL equal to MDTS if the "zone_append_size_limit"
  module parameter is not set
* Clean up nvme_zoned_init_ctrl() to make size calculations
  less confusing
* Avoid changing module parameters, use separate n/s variables
  if additional calculations are necessary to convert parameters
  to running values
* Use NVME_DEFAULT_ZONE_SIZE to assign the default zone size value
* Use default 0 for zone capacity meaning that zone capacity will
  be equal to zone size by default
* Issue warnings if user MAR/MOR values are too large and have
  to be adjusted
* Use unsigned values for MAR/MOR
 - Dropped "Simulate Zone Active excursions" patch.
   Excursion behavior may depend on the internal controller
   architecture and therefore be vendor-specific.
 - Dropped support for Zone Attributes and zoned AENs for now.
   These features can be added in a future series.
 - NS Types support is extended to handle active/inactive namespaces.
 - Update the write pointer after backing storage I/O completion, not
   before. This makes the emulation run correctly in case of backing
   device failures.
 - Avoid division in the I/O path if the device zone size is a power
   of two (the most common case). The zone index can then be calculated
   with a bit shift (see the sketch after this list).
 - A few reported bugs have been fixed.
 - Indentation in function definitions has been changed to make it
   the same as the rest of the code.
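
A minimal sketch of the power-of-two fast path mentioned above (assumed
field names, and using the GCC/Clang builtin in place of QEMU's clz64
helper, for illustration only):

    #include <stdbool.h>
    #include <stdint.h>

    struct zoned_ns {
        uint64_t zone_size;        /* zone size in logical blocks, nonzero  */
        uint32_t zone_size_log2;   /* valid only when zone_size is 2^n      */
        bool     size_is_pow2;
    };

    static void init_zone_geometry(struct zoned_ns *ns)
    {
        ns->size_is_pow2 = (ns->zone_size & (ns->zone_size - 1)) == 0;
        if (ns->size_is_pow2) {
            ns->zone_size_log2 = 63 - __builtin_clzll(ns->zone_size);
        }
    }

    static uint64_t zone_index(const struct zoned_ns *ns, uint64_t slba)
    {
        /* Shift in the common case, divide otherwise. */
        return ns->size_is_pow2 ? slba >> ns->zone_size_log2
                                : slba / ns->zone_size;
    }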


The Zoned Namespace (ZNS) Command Set is a newly introduced command set
published by the NVM Express, Inc. organization as TP 4053. The main
design goals of ZNS are to give hardware designers the means to reduce
NVMe controller complexity and to achieve better I/O latency and
throughput. SSDs that implement this interface are commonly known as
ZNS SSDs.

This command set implements a zoned storage model, similar to ZAC/ZBC.
As such, there is already support in Linux, allowing one to perform
the majority of tasks needed to manage ZNS SSDs.

The Zoned Namespace Command Set relies on another TP, known as
Namespace Types (NVMe TP 4056), which introduces support for having
multiple command sets per namespace.

Both ZNS and Namespace Types specifications can be downloaded by
visiting the following link -

https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs.zip

This patch series adds Namespace Types support and zoned namespace
emulation capability to the existing NVMe PCI device.

The 

[PATCH v5 01/14] hw/block/nvme: Report actual LBA data shift in LBAF

2020-09-27 Thread Dmitry Fomichev
Calculate the data shift value to report based on the configured value
of the logical_block_size device property.

In the process, use a local variable to calculate the LBA format
index instead of the hardcoded value 0. This makes the code more
readable and it will make it easier to add support for multiple LBA
formats in the future.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme-ns.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 2ba0263dda..bbd7879492 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -47,6 +47,8 @@ static void nvme_ns_init(NvmeNamespace *ns)
 
 static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
 {
+    int lba_index;
+
     if (!blkconf_blocksizes(&ns->blkconf, errp)) {
         return -1;
     }
@@ -67,6 +69,9 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         n->features.vwc = 0x1;
     }
 
+    lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+    ns->id_ns.lbaf[lba_index].ds = 31 - clz32(n->conf.logical_block_size);
+
     return 0;
 }
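
For a power-of-two block size, 31 - clz32(size) is simply log2(size), so a
4096-byte logical block yields ds = 12 and a 512-byte block yields ds = 9.
A standalone illustration (using the GCC/Clang builtin in place of QEMU's
clz32 helper):

    #include <assert.h>
    #include <stdint.h>

    /* 31 - clz32(x) == log2(x) for a power-of-two x. */
    static uint8_t lba_data_shift(uint32_t logical_block_size)
    {
        return 31 - __builtin_clz(logical_block_size);
    }

    int main(void)
    {
        assert(lba_data_shift(512) == 9);
        assert(lba_data_shift(4096) == 12);
        return 0;
    }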
 
-- 
2.21.0




RE: [PATCH v4 00/14] hw/block/nvme: Support Namespace Types and Zoned Namespace Command Set

2020-09-27 Thread Dmitry Fomichev
> -Original Message-
> From: Klaus Jensen 
> Sent: Thursday, September 24, 2020 5:08 PM
> To: Dmitry Fomichev 
> Cc: Keith Busch ; Klaus Jensen
> ; Kevin Wolf ; Philippe
> Mathieu-Daudé ; Maxim Levitsky
> ; Fam Zheng ; Niklas Cassel
> ; Damien Le Moal ;
> qemu-block@nongnu.org; qemu-de...@nongnu.org; Alistair Francis
> ; Matias Bjorling 
> Subject: Re: [PATCH v4 00/14] hw/block/nvme: Support Namespace Types
> and Zoned Namespace Command Set
> 
> On Sep 24 03:20, Dmitry Fomichev wrote:
> > v3 -> v4
> >
> >  - Fix bugs introduced in v2/v3 for QD > 1 operation. Now, all writes
> >to a zone happen at the new write pointer variable, zone->w_ptr,
> >that is advanced right after submitting the backend i/o. The existing
> >zone->d.wp variable is updated upon the successful write completion
> >and it is used for zone reporting. Some code has been split from
> >nvme_finalize_zoned_write() function to a new function,
> >nvme_advance_zone_wp().
> >
> 
> Same approach that I've used, +1.
> 
> >  - Make the code compile under mingw. Switch to using QEMU API for
> >mmap/msync, i.e. memory_region...(). Since mmap is not available in
> >mingw (even though there is mman-win32 library available on Github),
> >conditional compilation is added around these calls to avoid
> >undefined symbols under mingw. A better fix would be to add stub
> >functions to softmmu/memory.c for the case when CONFIG_POSIX is not
> >defined, but such change is beyond the scope of this patchset and it
> >can be made in a separate patch.
> >
> 
> E.
> 
> This feels like a hack or at the very least an abuse of the physical
> memory management API.
> 
> If it really needs to be memory mapped, then I think a hostmem-based
> approach similar to what Andrzej did for PMR is needed (I think that
> will get rid of the CONFIG_POSIX ifdef at least, but still leave it
> slightly tricky to get it to work on all platforms AFAIK).

Ok, it looks like using the HostMemoryBackendFile backend will be
more appropriate. This will remove the need for conditional compilation.

The mmap() portability is pretty decent across software platforms.
Any poor Windows user who is forced to emulate ZNS on mingw will be
able to do so, just without having zone state persistency. Considering
how specialized this stuff is in the first place, I estimate the number of users
affected by this "limitation" to be exactly zero.

> But really,
> since we do not require memory semantics for this, then I think the
> abstraction is fundamentally wrong.
> 

Seriously, what is wrong with using mmap :) ? It is used successfully for
similar applications, for example -
https://github.com/open-iscsi/tcmu-runner/blob/master/file_zbc.c

> I am, of course, blowing my own horn, since my implementation uses a
> portable blockdev for this.
> 

You are making it sound like the entire WDC series relies on this approach.
Actually, the persistency is introduced in the second to last patch in the
series and it only adds a couple of lines of code in the i/o path to mark
zones dirty. This is possible because of using mmap() and I find the way
it is done to be quite elegant, not ugly :)
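
Roughly, those couple of lines in the i/o path look like the sketch below
(made-up names and a simplified, page-sized per-zone layout; not the actual
patch code):

    #include <stdint.h>
    #include <sys/mman.h>

    /* One descriptor per zone, mmap()ed from the zone metadata file.
     * Padding keeps each descriptor page-aligned for this illustration. */
    struct zone_meta {
        uint64_t wp;
        uint8_t  state;
        uint8_t  pad[4087];
    };

    static void mark_zone_dirty(struct zone_meta *zd, uint64_t zone_idx,
                                uint64_t new_wp, uint8_t new_state)
    {
        zd[zone_idx].wp = new_wp;
        zd[zone_idx].state = new_state;
        /* Flush only this zone's descriptor; MS_ASYNC stays off the hot path. */
        msync(&zd[zone_idx], sizeof(zd[zone_idx]), MS_ASYNC);
    }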

> Another issue is the complete lack of endian conversions. Does it
> matter? It depends. Will anyone ever use this on a big endian host and
> move the meta data backing file to a little endian host? Probably not.
> So does it really matter? Probably not, but it is cutting corners.
> 

Great point on endianness! Naturally, all file-backed values are stored in
their native endianness. This way, there is no extra overhead on big-endian
hardware architectures. Portability concerns can be easily addressed by
storing the metadata endianness as a byte flag in its header. Then, during
initialization, the metadata validation code can detect a possible
discrepancy in endianness and automatically convert the metadata to the
endianness of the host. This part is out of the scope of this series, but I
would be able to contribute such a solution as an enhancement in the future.
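
For illustration, such a check could look like this (hypothetical header
layout and helper names, not part of this series):

    #include <stdint.h>

    #define ZMETA_LITTLE_ENDIAN 0x01
    #define ZMETA_BIG_ENDIAN    0x02

    /* Hypothetical metadata header carrying an endianness byte. */
    struct zmeta_header {
        uint8_t  endianness;   /* creator's native byte order          */
        uint8_t  rsvd[7];
        uint64_t nr_zones;     /* stored in the creator's native order */
    };

    static uint8_t host_endianness(void)
    {
        const union { uint16_t u16; uint8_t u8[2]; } probe = { .u16 = 1 };
        return probe.u8[0] ? ZMETA_LITTLE_ENDIAN : ZMETA_BIG_ENDIAN;
    }

    static void zmeta_fixup(struct zmeta_header *hdr)
    {
        if (hdr->endianness != host_endianness()) {
            /* Convert every multi-byte field to the host's byte order. */
            hdr->nr_zones = __builtin_bswap64(hdr->nr_zones);
            hdr->endianness = host_endianness();
        }
    }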

> >  - Make the list of review comments addressed in v2 of the series
> >(see below).
> >
> 
> Very detailed! Thanks!


Re: [PATCH v5 0/7] vhost-user-blk: fix the migration issue and enhance qtests

2020-09-27 Thread Dima Stepanov
On Thu, Sep 24, 2020 at 07:26:14AM -0400, Michael S. Tsirkin wrote:
> On Fri, Sep 11, 2020 at 11:39:42AM +0300, Dima Stepanov wrote:
> > v4 -> v5:
> >   - vhost: check queue state in the vhost_dev_set_log routine
> > tests/qtest/vhost-user-test: prepare the tests for adding new dev class
> > tests/qtest/vhost-user-test: add support for the vhost-user-blk device
> > tests/qtest/vhost-user-test: add migrate_reconnect test
> > Reviewed-by: Raphael Norwitz
> >   - Update qtest by merging the vhost-user-blk "if" case with the
> > virtio-blk case.
> 
> I dropped patches 3-7 since they were stalling on some systems.
> Pls work with Peter Maydell (cc'd) to figure it out.
Thanks!

Peter, can you share any details for the stalling errors with me?

> 
> 
> > v3 -> v4:
> >   - vhost: recheck dev state in the vhost_migration_log routine
> > Reviewed-by: Raphael Norwitz
> >   - vhost: check queue state in the vhost_dev_set_log routine
> > Use "continue" instead of "break" to handle non-initialized
> > virtqueue case.
> > 
> > v2 -> v3:
> >   - update commit message for the 
> > "vhost: recheck dev state in the vhost_migration_log routine" commit
> >   - rename "started" field of the VhostUserBlk structure to
> > "started_vu", so there will be no confustion with the VHOST started
> > field
> >   - update vhost-user-test.c to always initialize nq local variable
> > (spotted by patchew)
> > 
> > v1 -> v2:
> >   - add comments to connected/started fields in the header file
> >   - move the "s->started" logic from the vhost_user_blk_disconnect
> > routine to the vhost_user_blk_stop routine
> > 
> > Reference e-mail threads:
> >   - https://lists.gnu.org/archive/html/qemu-devel/2020-05/msg01509.html
> >   - https://lists.gnu.org/archive/html/qemu-devel/2020-05/msg05241.html
> > 
> > If a vhost-user daemon is used as a backend for the vhost device, then we
> > should consider the possibility of a disconnect at any moment. There was a
> > general question here: should we consider it an error or an okay state for
> > the vhost-user devices during the migration process?
> > I think the disconnect event for the vhost-user devices should not break the
> > migration process, because:
> >   - the device will be in the stopped state, so it will not be changed
> > during migration
> >   - if a reconnect is made, the migration log will be reinitialized as
> > part of the reconnect/init process:
> > #0  vhost_log_global_start (listener=0x563989cf7be0)
> > at hw/virtio/vhost.c:920
> > #1  0x56398603d8bc in listener_add_address_space 
> > (listener=0x563989cf7be0,
> > as=0x563986ea4340 )
> > at softmmu/memory.c:2664
> > #2  0x56398603dd30 in memory_listener_register 
> > (listener=0x563989cf7be0,
> > as=0x563986ea4340 )
> > at softmmu/memory.c:2740
> > #3  0x563985fd6956 in vhost_dev_init (hdev=0x563989cf7bd8,
> > opaque=0x563989cf7e30, backend_type=VHOST_BACKEND_TYPE_USER,
> > busyloop_timeout=0)
> > at hw/virtio/vhost.c:1385
> > #4  0x563985f7d0b8 in vhost_user_blk_connect (dev=0x563989cf7990)
> > at hw/block/vhost-user-blk.c:315
> > #5  0x563985f7d3f6 in vhost_user_blk_event (opaque=0x563989cf7990,
> > event=CHR_EVENT_OPENED)
> > at hw/block/vhost-user-blk.c:379
> > The first patch in the patchset fixes this issue by setting the vhost
> > device to the stopped state in the disconnect handler and checking it in
> > the vhost_migration_log() routine before returning from the function.
> > The qtest framework was updated to test vhost-user-blk functionality. The
> > vhost-user-blk/vhost-user-blk-tests/migrate_reconnect test was added to
> > reproduce the original issue found.
> > 
> > Dima Stepanov (7):
> >   vhost: recheck dev state in the vhost_migration_log routine
> >   vhost: check queue state in the vhost_dev_set_log routine
> >   tests/qtest/vhost-user-test: prepare the tests for adding new dev
> > class
> >   tests/qtest/libqos/virtio-blk: add support for vhost-user-blk
> >   tests/qtest/vhost-user-test: add support for the vhost-user-blk device
> >   tests/qtest/vhost-user-test: add migrate_reconnect test
> >   tests/qtest/vhost-user-test: enable the reconnect tests
> > 
> >  hw/block/vhost-user-blk.c  |  19 ++-
> >  hw/virtio/vhost.c  |  39 -
> >  include/hw/virtio/vhost-user-blk.h |  10 ++
> >  tests/qtest/libqos/virtio-blk.c|  14 +-
> >  tests/qtest/vhost-user-test.c  | 290 +++--
> >  5 files changed, 322 insertions(+), 50 deletions(-)
> > 
> > -- 
> > 2.7.4
>