From: Gollu Appalanaidu <anaidu.go...@samsung.com>

Add support for marking blocks invalid with the Write Uncorrectable
command. Block status is tracked in a (non-persistent) bitmap that is
checked on all reads and written to on all writes. This is potentially
expensive, so keep Write Uncorrectable disabled by default.
Signed-off-by: Gollu Appalanaidu <anaidu.go...@samsung.com> Signed-off-by: Klaus Jensen <k.jen...@samsung.com> --- docs/specs/nvme.txt | 3 ++ hw/block/nvme-ns.h | 2 ++ hw/block/nvme.h | 1 + hw/block/nvme-ns.c | 2 ++ hw/block/nvme.c | 65 +++++++++++++++++++++++++++++++++++++------ hw/block/trace-events | 1 + 6 files changed, 66 insertions(+), 8 deletions(-) diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt index 56d393884e7a..88f9cc278d4c 100644 --- a/docs/specs/nvme.txt +++ b/docs/specs/nvme.txt @@ -19,5 +19,8 @@ Known issues * The accounting numbers in the SMART/Health are reset across power cycles +* Marking blocks invalid with the Write Uncorrectable command is not + persisted across power cycles. + * Interrupt Coalescing is not supported and is disabled by default in volation of the specification. diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index 7af6884862b5..15fa422ded03 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -72,6 +72,8 @@ typedef struct NvmeNamespace { struct { uint32_t err_rec; } features; + + unsigned long *uncorrectable; } NvmeNamespace; static inline uint32_t nvme_nsid(NvmeNamespace *ns) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 98082b2dfba3..9b8f85b9cf16 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -68,6 +68,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc) case NVME_CMD_FLUSH: return "NVME_NVM_CMD_FLUSH"; case NVME_CMD_WRITE: return "NVME_NVM_CMD_WRITE"; case NVME_CMD_READ: return "NVME_NVM_CMD_READ"; + case NVME_CMD_WRITE_UNCOR: return "NVME_CMD_WRITE_UNCOR"; case NVME_CMD_COMPARE: return "NVME_NVM_CMD_COMPARE"; case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES"; case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM"; diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index ade46e2f3739..742bbc4b4b62 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -72,6 +72,8 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) id_ns->mcl = cpu_to_le32(ns->params.mcl); id_ns->msrc = 
ns->params.msrc; + ns->uncorrectable = bitmap_new(id_ns->nsze); + return 0; } diff --git a/hw/block/nvme.c b/hw/block/nvme.c index e5f6666725d7..56048046c193 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1112,6 +1112,20 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba, return NVME_SUCCESS; } +static inline uint16_t nvme_check_uncor(NvmeNamespace *ns, uint64_t slba, + uint32_t nlb) +{ + uint64_t elba = nlb + slba; + + if (ns->uncorrectable) { + if (find_next_bit(ns->uncorrectable, elba, slba) < elba) { + return NVME_UNRECOVERED_READ | NVME_DNR; + } + } + + return NVME_SUCCESS; +} + static void nvme_aio_err(NvmeRequest *req, int ret) { uint16_t status = NVME_SUCCESS; @@ -1423,14 +1437,24 @@ static void nvme_rw_cb(void *opaque, int ret) BlockAcctCookie *acct = &req->acct; BlockAcctStats *stats = blk_get_stats(blk); + bool is_write = nvme_is_write(req); + trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk)); - if (ns->params.zoned && nvme_is_write(req)) { + if (ns->params.zoned && is_write) { nvme_finalize_zoned_write(ns, req); } if (!ret) { block_acct_done(stats, acct); + + if (is_write) { + NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; + uint64_t slba = le64_to_cpu(rw->slba); + uint32_t nlb = le16_to_cpu(rw->nlb) + 1; + + bitmap_clear(ns->uncorrectable, slba, nlb); + } } else { block_acct_failed(stats, acct); nvme_aio_err(req, ret); @@ -1521,13 +1545,13 @@ static void nvme_copy_cb(void *opaque, int ret) { NvmeRequest *req = opaque; NvmeNamespace *ns = req->ns; + NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; + uint64_t sdlba = le64_to_cpu(copy->sdlba); struct nvme_copy_ctx *ctx = req->opaque; trace_pci_nvme_copy_cb(nvme_cid(req)); if (ns->params.zoned) { - NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; - uint64_t sdlba = le64_to_cpu(copy->sdlba); NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba); __nvme_advance_zone_wp(ns, zone, ctx->nlb); @@ -1535,6 +1559,7 @@ static void nvme_copy_cb(void *opaque, int ret) if (!ret) { 
block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); + bitmap_clear(ns->uncorrectable, sdlba, ctx->nlb); } else { block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); nvme_aio_err(req, ret); @@ -1953,6 +1978,12 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) goto invalid; } + status = nvme_check_uncor(ns, slba, nlb); + if (status) { + trace_pci_nvme_err_unrecoverable_read(slba, nlb); + return status; + } + if (ns->params.zoned) { status = nvme_check_zone_read(ns, slba, nlb); if (status) { @@ -1992,7 +2023,7 @@ invalid: } static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, - bool wrz) + bool wrz, bool uncor) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; NvmeNamespace *ns = req->ns; @@ -2008,7 +2039,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode), nvme_nsid(ns), nlb, data_size, slba); - if (!wrz) { + if (!wrz && !uncor) { status = nvme_check_mdts(n, data_size); if (status) { trace_pci_nvme_err_mdts(nvme_cid(req), data_size); @@ -2055,6 +2086,11 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, zone->w_ptr += nlb; } + if (uncor) { + bitmap_set(ns->uncorrectable, slba, nlb); + return NVME_SUCCESS; + } + data_offset = nvme_l2b(ns, slba); if (!wrz) { @@ -2087,17 +2123,22 @@ invalid: static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req) { - return nvme_do_write(n, req, false, false); + return nvme_do_write(n, req, false, false, false); } static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) { - return nvme_do_write(n, req, false, true); + return nvme_do_write(n, req, false, true, false); } static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req) { - return nvme_do_write(n, req, true, false); + return nvme_do_write(n, req, true, false, false); +} + +static inline uint16_t nvme_write_uncor(NvmeCtrl *n, NvmeRequest *req) +{ + return nvme_do_write(n, req, false, 
false, true); } static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c, @@ -2596,6 +2637,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) return nvme_flush(n, req); case NVME_CMD_WRITE_ZEROES: return nvme_write_zeroes(n, req); + case NVME_CMD_WRITE_UNCOR: + return nvme_write_uncor(n, req); case NVME_CMD_ZONE_APPEND: return nvme_zone_append(n, req); case NVME_CMD_WRITE: @@ -4514,6 +4557,11 @@ static void nvme_init_cse_iocs(NvmeCtrl *n) n->iocs.nvm[NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC; n->iocs.nvm[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP; + if (oncs & NVME_ONCS_WRITE_UNCORR) { + n->iocs.nvm[NVME_CMD_WRITE_UNCOR] = NVME_CMD_EFF_CSUPP | + NVME_CMD_EFF_LBCC; + } + if (oncs & NVME_ONCS_WRITE_ZEROES) { n->iocs.nvm[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC; @@ -4853,6 +4901,7 @@ static void nvme_exit(PCIDevice *pci_dev) } nvme_ns_cleanup(ns); + g_free(ns->uncorrectable); } g_free(n->cq); diff --git a/hw/block/trace-events b/hw/block/trace-events index 4b5ee04024f4..f30ef220c26a 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -128,6 +128,7 @@ pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PR pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" +pci_nvme_err_unrecoverable_read(uint64_t start, uint32_t nlb) "islba 0x%"PRIx64" nlb %"PRIu32"" pci_nvme_err_invalid_log_page_offset(uint64_t ofs, uint64_t size) "must be <= %"PRIu64", got %"PRIu64"" pci_nvme_err_cmb_invalid_cba(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64"" pci_nvme_err_cmb_not_enabled(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64"" -- 2.30.0