From: Klaus Jensen <k.jen...@samsung.com>

Signed-off-by: Klaus Jensen <k.jen...@samsung.com>
---
 docs/specs/nvme.txt   |  15 +++
 hw/block/nvme-ns.h    |  16 ++++
 hw/block/nvme-ns.c    | 212 +++++++++++++++++++++++++++++++++++++++++-
 hw/block/nvme.c       |  87 +++++++++++++++++
 hw/block/trace-events |   2 +
 5 files changed, 331 insertions(+), 1 deletion(-)
diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
index 03bb4d9516b4..05d81c88ad4e 100644
--- a/docs/specs/nvme.txt
+++ b/docs/specs/nvme.txt
@@ -20,6 +20,21 @@ The nvme device (-device nvme) emulates an NVM Express Controller.
   `zns.mor`; Specifies the number of open resources available. This is a 0s
      based value.
 
+  `zns.pstate`; This parameter specifies another blockdev to be used for
+     storing zone state persistently.
+
+       -drive id=zns-pstate,file=zns-pstate.img,format=raw
+       -device nvme-ns,zns.pstate=zns-pstate,...
+
+     To reset (or initialize) state, the blockdev image should be of zero size:
+
+       qemu-img create -f raw zns-pstate.img 0
+
+     The image will be initialized with a file format header and truncated to
+     the required size. If the pstate given is of non-zero size, it will be
+     assumed to already contain zone state information and the header will be
+     checked.
+
 
 Reference Specifications
 ------------------------
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 05a79a214605..5cb4c1da59ce 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -19,6 +19,15 @@
 #define NVME_NS(obj) \
     OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
 
+#define NVME_ZONE_PSTATE_MAGIC ((0x00 << 24) | ('S' << 16) | ('N' << 8) | 'Z')
+#define NVME_ZONE_PSTATE_V1 1
+
+typedef struct NvmeZonePStateHeader {
+    uint32_t magic;
+    uint32_t version;
+    uint8_t  rsvd8[4088];
+} QEMU_PACKED NvmeZonePStateHeader;
+
 typedef struct NvmeNamespaceParams {
     uint32_t nsid;
     uint8_t  iocs;
@@ -74,6 +83,8 @@ typedef struct NvmeNamespace {
             QTAILQ_HEAD(, NvmeZone) lru_open;
             QTAILQ_HEAD(, NvmeZone) lru_active;
         } resources;
+
+        BlockBackend *pstate;
     } zns;
 } NvmeNamespace;
 
@@ -186,4 +197,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_flush(NvmeNamespace *ns);
 
+static inline void _nvme_ns_check_size(void)
+{
+    QEMU_BUILD_BUG_ON(sizeof(NvmeZonePStateHeader) != 4096);
+}
+
 #endif /* NVME_NS_H */
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index cd0f075dd281..4f311dd704c0 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -50,6 +50,31 @@ const char *nvme_zs_to_str(NvmeZoneState zs)
     return "UNKNOWN";
 }
 
+static int nvme_blk_truncate(BlockBackend *blk, size_t len, Error **errp)
+{
+    int ret;
+    uint64_t perm, shared_perm;
+
+    blk_get_perm(blk, &perm, &shared_perm);
+
+    ret = blk_set_perm(blk, perm | BLK_PERM_RESIZE, shared_perm, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = blk_truncate(blk, len, false, PREALLOC_MODE_OFF, 0, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = blk_set_perm(blk, perm, shared_perm, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return 0;
+}
+
 static void nvme_ns_zns_init_zones(NvmeNamespace *ns)
 {
     NvmeZone *zone;
@@ -153,6 +178,176 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
     return 0;
 }
 
+static int nvme_ns_zns_restore_zone_state(NvmeNamespace *ns, Error **errp)
+{
+    for (int i = 0; i < ns->zns.num_zones; i++) {
+        NvmeZone *zone = &ns->zns.zones[i];
+        zone->zd = &ns->zns.zd[i];
+        if (ns->params.zns.zdes) {
+            zone->zde = &ns->zns.zde[i];
+        }
+
+        switch (nvme_zs(zone)) {
+        case NVME_ZS_ZSE:
+        case NVME_ZS_ZSF:
+        case NVME_ZS_ZSRO:
+        case NVME_ZS_ZSO:
+            break;
+
+        case NVME_ZS_ZSC:
+            if (nvme_wp(zone) == nvme_zslba(zone) &&
+                !(zone->zd->za & NVME_ZA_ZDEV)) {
+                nvme_zs_set(zone, NVME_ZS_ZSE);
+                break;
+            }
+
+            if (ns->zns.resources.active) {
+                ns->zns.resources.active--;
+                QTAILQ_INSERT_TAIL(&ns->zns.resources.lru_active, zone,
+                                   lru_entry);
+                break;
+            }
+
+            /* fallthrough */
+
+        case NVME_ZS_ZSIO:
+        case NVME_ZS_ZSEO:
+            zone->zd->wp = zone->zd->zslba;
+            nvme_zs_set(zone, NVME_ZS_ZSF);
+            break;
+
+        default:
+            error_setg(errp, "invalid zone state");
+            return -1;
+        }
+
+        zone->wp_staging = nvme_wp(zone);
+    }
+
+    return 0;
+}
+
+static int nvme_ns_zns_init_pstate(NvmeNamespace *ns, Error **errp)
+{
+    BlockBackend *blk = ns->zns.pstate;
+    NvmeZonePStateHeader header;
+    size_t zd_len, zde_len;
+    int ret;
+
+    zd_len = ns->zns.num_zones * sizeof(NvmeZoneDescriptor);
+    zde_len = ns->zns.num_zones * nvme_ns_zdes_bytes(ns);
+
+    ret = nvme_blk_truncate(blk, zd_len + zde_len + sizeof(header), errp);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not truncate zone pstate");
+        return ret;
+    }
+
+    nvme_ns_zns_init_zones(ns);
+
+    ret = blk_pwrite(blk, 0, ns->zns.zd, zd_len, 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not write zone descriptors to "
+                         "zone pstate");
+        return ret;
+    }
+
+    header = (NvmeZonePStateHeader) {
+        .magic = cpu_to_le32(NVME_ZONE_PSTATE_MAGIC),
+        .version = cpu_to_le32(NVME_ZONE_PSTATE_V1),
+    };
+
+    ret = blk_pwrite(blk, zd_len + zde_len, &header, sizeof(header), 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not write zone pstate header");
+        return ret;
+    }
+
+    return 0;
+}
+
+static int nvme_ns_zns_load_pstate(NvmeNamespace *ns, size_t len, Error **errp)
+{
+    BlockBackend *blk = ns->zns.pstate;
+    NvmeZonePStateHeader header;
+    size_t zd_len, zde_len;
+    int ret;
+
+    ret = blk_pread(blk, len - sizeof(header), &header, sizeof(header));
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not read zone pstate header");
+        return ret;
+    }
+
+    if (le32_to_cpu(header.magic) != NVME_ZONE_PSTATE_MAGIC) {
+        error_setg(errp, "invalid zone pstate header");
+        return -1;
+    } else if (le32_to_cpu(header.version) > NVME_ZONE_PSTATE_V1) {
+        error_setg(errp, "unsupported zone pstate version");
+        return -1;
+    }
+
+    zd_len = ns->zns.num_zones * sizeof(NvmeZoneDescriptor);
+    zde_len = ns->zns.num_zones * nvme_ns_zdes_bytes(ns);
+
+    ret = blk_pread(blk, 0, ns->zns.zd, zd_len);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not read zone descriptors from "
+                         "zone pstate");
+        return ret;
+    }
+
+    if (zde_len) {
+        ret = blk_pread(blk, zd_len, ns->zns.zde, zde_len);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "could not read zone descriptor "
+                             "extensions from zone pstate");
+            return ret;
+        }
+    }
+
+    if (nvme_ns_zns_restore_zone_state(ns, errp)) {
+        return -1;
+    }
+
+    ret = blk_pwrite(blk, 0, ns->zns.zd, zd_len, 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not write zone descriptors to "
+                         "zone pstate");
+        return ret;
+    }
+
+    return 0;
+}
+
+static int nvme_ns_zns_setup_pstate(NvmeNamespace *ns, Error **errp)
+{
+    BlockBackend *blk = ns->zns.pstate;
+    uint64_t perm, shared_perm;
+    ssize_t len;
+    int ret;
+
+    perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
+    shared_perm = BLK_PERM_ALL;
+
+    ret = blk_set_perm(blk, perm, shared_perm, errp);
+    if (ret) {
+        return ret;
+    }
+
+    len = blk_getlength(blk);
+    if (len < 0) {
+        error_setg_errno(errp, -len, "could not determine zone pstate size");
+        return len;
+    }
+
+    if (!len) {
+        return nvme_ns_zns_init_pstate(ns, errp);
+    }
+
+    return nvme_ns_zns_load_pstate(ns, len, errp);
+}
+
 static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
 {
     if (!blkconf_blocksizes(&ns->blkconf, errp)) {
@@ -236,7 +431,13 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
     }
 
     if (nvme_ns_zoned(ns)) {
-        nvme_ns_zns_init_zones(ns);
+        if (ns->zns.pstate) {
+            if (nvme_ns_zns_setup_pstate(ns, errp)) {
+                return -1;
+            }
+        } else {
+            nvme_ns_zns_init_zones(ns);
+        }
     }
 
     if (nvme_register_namespace(n, ns, errp)) {
@@ -249,11 +450,19 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
 void nvme_ns_drain(NvmeNamespace *ns)
 {
     blk_drain(ns->blkconf.blk);
+
+    if (ns->zns.pstate) {
+        blk_drain(ns->zns.pstate);
+    }
 }
 
 void nvme_ns_flush(NvmeNamespace *ns)
 {
     blk_flush(ns->blkconf.blk);
+
+    if (ns->zns.pstate) {
+        blk_flush(ns->zns.pstate);
+    }
 }
 
 static void nvme_ns_realize(DeviceState *dev, Error **errp)
@@ -283,6 +492,7 @@ static Property nvme_ns_props[] = {
     DEFINE_PROP_UINT8("zns.zdes", NvmeNamespace, params.zns.zdes, 0),
     DEFINE_PROP_UINT32("zns.mar", NvmeNamespace, params.zns.mar, 0xffffffff),
     DEFINE_PROP_UINT32("zns.mor", NvmeNamespace, params.zns.mor, 0xffffffff),
+    DEFINE_PROP_DRIVE("zns.pstate", NvmeNamespace, zns.pstate),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e62efd7cf0c4..04ad9f20223d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1023,6 +1023,46 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
     return NVME_SUCCESS;
 }
 
+static int nvme_zns_commit_zone(NvmeNamespace *ns, NvmeZone *zone)
+{
+    uint64_t zslba;
+    int64_t offset;
+
+    if (!ns->zns.pstate) {
+        return 0;
+    }
+
+    trace_pci_nvme_zns_commit_zone(nvme_nsid(ns), nvme_zslba(zone));
+
+    zslba = nvme_zslba(zone);
+    offset = nvme_ns_zone_idx(ns, zslba) * sizeof(NvmeZoneDescriptor);
+
+    return blk_pwrite(ns->zns.pstate, offset, zone->zd,
+                      sizeof(NvmeZoneDescriptor), 0);
+}
+
+static int nvme_zns_commit_zde(NvmeNamespace *ns, NvmeZone *zone)
+{
+    uint64_t zslba;
+    int zidx;
+    size_t zd_len, zdes_bytes;
+    int64_t offset;
+
+    if (!ns->zns.pstate) {
+        return 0;
+    }
+
+    trace_pci_nvme_zns_commit_zde(nvme_nsid(ns), nvme_zslba(zone));
+
+    zd_len = ns->zns.num_zones * sizeof(NvmeZoneDescriptor);
+    zslba = nvme_zslba(zone);
+    zidx = nvme_ns_zone_idx(ns, zslba);
+    zdes_bytes = nvme_ns_zdes_bytes(ns);
+    offset = zd_len + zidx * zdes_bytes;
+
+    return blk_pwrite(ns->zns.pstate, offset, zone->zde, zdes_bytes, 0);
+}
+
 static inline void nvme_zone_reset_wp(NvmeZone *zone)
 {
     zone->zd->wp = zone->zd->zslba;
@@ -1058,6 +1098,10 @@ static uint16_t nvme_zrm_release_open(NvmeNamespace *ns)
         return status;
     }
 
+    if (nvme_zns_commit_zone(ns, candidate) < 0) {
+        return NVME_INTERNAL_DEV_ERROR;
+    }
+
     return NVME_SUCCESS;
 }
 
@@ -1252,6 +1296,10 @@ static uint16_t __nvme_zns_advance_wp(NvmeNamespace *ns, NvmeZone *zone,
         if (status) {
             return status;
         }
+
+        if (nvme_zns_commit_zone(ns, zone) < 0) {
+            return NVME_INTERNAL_DEV_ERROR;
+        }
     }
 
     return NVME_SUCCESS;
@@ -1307,6 +1355,10 @@ static void nvme_aio_err(NvmeRequest *req, int ret, NvmeZone *zone)
             NVME_ZS_ZSRO : NVME_ZS_ZSO;
 
         nvme_zrm_transition(ns, zone, zs);
+
+        if (nvme_zns_commit_zone(req->ns, zone) < 0) {
+            req->status = NVME_INTERNAL_DEV_ERROR;
+        }
     }
 
     /*
@@ -1618,6 +1670,10 @@ static void nvme_aio_zone_reset_cb(void *opaque, int ret)
         nvme_aio_err(req, ret, zone);
     }
 
+    if (nvme_zns_commit_zone(req->ns, zone) < 0) {
+        req->status = NVME_INTERNAL_DEV_ERROR;
+    }
+
     (*resets)--;
 
     if (*resets) {
@@ -1657,6 +1713,10 @@ static uint16_t nvme_zone_mgmt_send_close(NvmeCtrl *n, NvmeRequest *req,
         return status;
     }
 
+    if (nvme_zns_commit_zone(ns, zone) < 0) {
+        return NVME_INTERNAL_DEV_ERROR;
+    }
+
     return NVME_SUCCESS;
 }
 
@@ -1678,6 +1738,10 @@ static uint16_t nvme_zone_mgmt_send_finish(NvmeCtrl *n, NvmeRequest *req,
         return status;
     }
 
+    if (nvme_zns_commit_zone(ns, zone) < 0) {
+        return NVME_INTERNAL_DEV_ERROR;
+    }
+
     return NVME_SUCCESS;
 }
 
@@ -1699,6 +1763,10 @@ static uint16_t nvme_zone_mgmt_send_open(NvmeCtrl *n, NvmeRequest *req,
         return status;
     }
 
+    if (nvme_zns_commit_zone(ns, zone) < 0) {
+        return NVME_INTERNAL_DEV_ERROR;
+    }
+
     return NVME_SUCCESS;
 }
 
@@ -1754,6 +1822,10 @@ static uint16_t nvme_zone_mgmt_send_offline(NvmeCtrl *n, NvmeRequest *req,
     case NVME_ZS_ZSRO:
         nvme_zrm_transition(ns, zone, NVME_ZS_ZSO);
 
+        if (nvme_zns_commit_zone(ns, zone) < 0) {
+            return NVME_INTERNAL_DEV_ERROR;
+        }
+
         /* fallthrough */
 
     case NVME_ZS_ZSO:
@@ -1793,6 +1865,10 @@ static uint16_t nvme_zone_mgmt_send_set_zde(NvmeCtrl *n, NvmeRequest *req,
         return status;
     }
 
+    if (nvme_zns_commit_zde(ns, zone) < 0) {
+        return NVME_INTERNAL_DEV_ERROR;
+    }
+
     status = nvme_zrm_transition(ns, zone, NVME_ZS_ZSC);
     if (status) {
         return status;
@@ -1800,6 +1876,10 @@ static uint16_t nvme_zone_mgmt_send_set_zde(NvmeCtrl *n, NvmeRequest *req,
 
     NVME_ZA_SET(zone->zd->za, NVME_ZA_ZDEV);
 
+    if (nvme_zns_commit_zone(ns, zone) < 0) {
+        return NVME_INTERNAL_DEV_ERROR;
+    }
+
     return NVME_SUCCESS;
 }
 
@@ -2502,6 +2582,11 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
             goto invalid;
         }
 
+        if (nvme_zns_commit_zone(ns, zone) < 0) {
+            status = NVME_INTERNAL_DEV_ERROR;
+            goto invalid;
+        }
+
         break;
     }
 
@@ -3778,6 +3863,8 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n)
                 nvme_zrm_transition(ns, zone, NVME_ZS_ZSE);
             }
 
+            nvme_zns_commit_zone(ns, zone);
+
             /* fallthrough */
 
         default:
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 31482bfba1fe..aa5491c398b9 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -96,6 +96,8 @@ pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "c
 pci_nvme_zrm_transition(uint32_t nsid, uint64_t zslba, const char *s_from, uint8_t from, const char *s_to, uint8_t to) "nsid %"PRIu32" zslba 0x%"PRIx64" from '%s' (%"PRIu8") to '%s' (%"PRIu8")"
 pci_nvme_zrm_release_open(uint32_t nsid) "nsid %"PRIu32""
 pci_nvme_zns_advance_wp(uint32_t nsid, uint64_t zslba, uint64_t wp_orig, uint32_t nlb) "nsid 0x%"PRIx32" zslba 0x%"PRIx64" wp_orig 0x%"PRIx64" nlb %"PRIu32""
+pci_nvme_zns_commit_zone(uint32_t nsid, uint64_t zslba) "nsid 0x%"PRIx32" zslba 0x%"PRIx64""
+pci_nvme_zns_commit_zde(uint32_t nsid, uint64_t zslba) "nsid 0x%"PRIx32" zslba 0x%"PRIx64""
 pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64""
 pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64""
 pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16""
-- 
2.29.2
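
A minimal usage sketch, assembled from the qemu-img, -drive and -device
fragments in the docs/specs/nvme.txt hunk above; it is not part of the patch.
The binary name and the surrounding controller/namespace options (the nvme
controller device, the namespace data drive, and any other zns.* parameters
hidden behind the '...') are assumptions taken from the rest of the zoned
namespace series and may differ:

    # zero-sized pstate image: on first use the device truncates it, writes
    # the zone descriptors and appends the file format header
    qemu-img create -f raw zns-pstate.img 0

    qemu-system-x86_64 ... \
        -drive id=zns-pstate,file=zns-pstate.img,format=raw \
        -device nvme-ns,zns.pstate=zns-pstate,...

On a later start with the now non-zero image, the header is validated, the
zone descriptors (and zone descriptor extensions, when zns.zdes is non-zero)
are read back, and zone state is normalized by
nvme_ns_zns_restore_zone_state(), e.g. zones left implicitly or explicitly
open are transitioned to Full.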