[PATCH 2/2] hw/ufs: Add support MCQ of UFSHCI 4.0

2024-05-21 Thread Minwoo Im
This patch adds support for MCQ defined in UFSHCI 4.0.  This patch
utilized the legacy I/O codes as much as possible to support MCQ.

MCQ operation & runtime register is placed at 0x1000 offset of UFSHCI
register statically with no spare space among four registers (48B):

UfsMcqSqReg, UfsMcqSqIntReg, UfsMcqCqReg, UfsMcqCqIntReg

The maximum number of queues is 32 as per spec, and the default
MAC(Multiple Active Commands) is 32 in the device.

Example:
-device ufs,serial=foo,id=ufs0,mcq=true,mcq-maxq=8

Signed-off-by: Minwoo Im 
---
 hw/ufs/trace-events |  17 ++
 hw/ufs/ufs.c| 474 ++--
 hw/ufs/ufs.h|  98 -
 include/block/ufs.h |  23 ++-
 4 files changed, 592 insertions(+), 20 deletions(-)

diff --git a/hw/ufs/trace-events b/hw/ufs/trace-events
index 665e1a942b..dda7f8a2e5 100644
--- a/hw/ufs/trace-events
+++ b/hw/ufs/trace-events
@@ -11,13 +11,18 @@ ufs_exec_nop_cmd(uint32_t slot) "UTRLDBR slot %"PRIu32""
 ufs_exec_scsi_cmd(uint32_t slot, uint8_t lun, uint8_t opcode) "slot %"PRIu32", 
lun 0x%"PRIx8", opcode 0x%"PRIx8""
 ufs_exec_query_cmd(uint32_t slot, uint8_t opcode) "slot %"PRIu32", opcode 
0x%"PRIx8""
 ufs_process_uiccmd(uint32_t uiccmd, uint32_t ucmdarg1, uint32_t ucmdarg2, 
uint32_t ucmdarg3) "uiccmd 0x%"PRIx32", ucmdarg1 0x%"PRIx32", ucmdarg2 
0x%"PRIx32", ucmdarg3 0x%"PRIx32""
+ufs_mcq_complete_req(uint8_t qid) "sqid %"PRIu8""
+ufs_mcq_create_sq(uint8_t sqid, uint8_t cqid, uint64_t addr, uint16_t size) 
"mcq create sq sqid %"PRIu8", cqid %"PRIu8", addr 0x%"PRIx64", size %"PRIu16""
+ufs_mcq_create_cq(uint8_t cqid, uint64_t addr, uint16_t size) "mcq create cq 
cqid %"PRIu8", addr 0x%"PRIx64", size %"PRIu16""
 
 # error condition
 ufs_err_dma_read_utrd(uint32_t slot, uint64_t addr) "failed to read utrd. 
UTRLDBR slot %"PRIu32", UTRD dma addr %"PRIu64""
 ufs_err_dma_read_req_upiu(uint32_t slot, uint64_t addr) "failed to read req 
upiu. UTRLDBR slot %"PRIu32", request upiu addr %"PRIu64""
 ufs_err_dma_read_prdt(uint32_t slot, uint64_t addr) "failed to read prdt. 
UTRLDBR slot %"PRIu32", prdt addr %"PRIu64""
+ufs_err_dma_read_sq(uint8_t qid, uint64_t addr) "failed to read sqe. SQ qid 
%"PRIu8", sqe addr %"PRIu64""
 ufs_err_dma_write_utrd(uint32_t slot, uint64_t addr) "failed to write utrd. 
UTRLDBR slot %"PRIu32", UTRD dma addr %"PRIu64""
 ufs_err_dma_write_rsp_upiu(uint32_t slot, uint64_t addr) "failed to write rsp 
upiu. UTRLDBR slot %"PRIu32", response upiu addr %"PRIu64""
+ufs_err_dma_write_cq(uint32_t cqid, uint64_t addr) "failed to write cq entry. 
cqid %"PRIu8", hwaddr %"PRIu64""
 ufs_err_utrl_slot_error(uint32_t slot) "UTRLDBR slot %"PRIu32" is in error"
 ufs_err_utrl_slot_busy(uint32_t slot) "UTRLDBR slot %"PRIu32" is busy"
 ufs_err_unsupport_register_offset(uint32_t offset) "Register offset 
0x%"PRIx32" is not yet supported"
@@ -31,3 +36,15 @@ ufs_err_query_invalid_opcode(uint8_t opcode) "query request 
has invalid opcode.
 ufs_err_query_invalid_idn(uint8_t opcode, uint8_t idn) "query request has 
invalid idn. opcode: 0x%"PRIx8", idn 0x%"PRIx8""
 ufs_err_query_invalid_index(uint8_t opcode, uint8_t index) "query request has 
invalid index. opcode: 0x%"PRIx8", index 0x%"PRIx8""
 ufs_err_invalid_trans_code(uint32_t slot, uint8_t trans_code) "request upiu 
has invalid transaction code. slot: %"PRIu32", trans_code: 0x%"PRIx8""
+ufs_err_mcq_db_wr_invalid_sqid(uint8_t qid) "invalid mcq sqid %"PRIu8""
+ufs_err_mcq_db_wr_invalid_db(uint8_t qid, uint32_t db) "invalid mcq doorbell 
sqid %"PRIu8", db %"PRIu32""
+ufs_err_mcq_create_sq_invalid_sqid(uint8_t qid) "invalid mcq sqid %"PRIu8""
+ufs_err_mcq_create_sq_invalid_cqid(uint8_t qid) "invalid mcq cqid %"PRIu8""
+ufs_err_mcq_create_sq_already_exists(uint8_t qid) "mcq sqid %"PRIu8 "already 
exists"
+ufs_err_mcq_delete_sq_invalid_sqid(uint8_t qid) "invalid mcq sqid %"PRIu8""
+ufs_err_mcq_delete_sq_not_exists(uint8_t qid) "mcq sqid %"PRIu8 "not exists"
+ufs_err_mcq_create_cq_invalid_cqid(uint8_t qid) "invalid mcq cqid %"PRIu8""
+ufs_err_mcq_create_cq_already_exists(uint8_t qid) "mcq cqid %"PRIu8 "already 
exists"
+ufs_err_mcq_delete_cq_invalid_cqid(uint8_t qid) "invalid mcq cqid 

[PATCH 0/2] hw/ufs: Add support MCQ

2024-05-21 Thread Minwoo Im
UFSHCI 4.0 spec introduced MCQ(Multi-Circular Queue) to support multiple
command queues for UFS controller.  To test ufs-mcq path of kernel, MCQ
emulated device would be a good choice to go with.

The first patch added newly introduced fields in UFSHCI 4.0 to support
MCQ.  The other one made the actual changes for MCQ.

Please review.

Thanks,

Minwoo Im (2):
  hw/ufs: Update MCQ-related fields to block/ufs.h
  hw/ufs: Add support MCQ of UFSHCI 4.0

 hw/ufs/trace-events |  17 ++
 hw/ufs/ufs.c| 474 ++--
 hw/ufs/ufs.h|  98 -
 include/block/ufs.h | 131 +++-
 4 files changed, 698 insertions(+), 22 deletions(-)

-- 
2.34.1




[PATCH 1/2] hw/ufs: Update MCQ-related fields to block/ufs.h

2024-05-21 Thread Minwoo Im
This patch is a prep patch for the following MCQ support patch for
hw/ufs.  This patch updated minimal mandatory fields to support MCQ
based on UFSHCI 4.0.

Signed-off-by: Minwoo Im 
---
 include/block/ufs.h | 108 +++-
 1 file changed, 106 insertions(+), 2 deletions(-)

diff --git a/include/block/ufs.h b/include/block/ufs.h
index d61598b8f3..3513b6e772 100644
--- a/include/block/ufs.h
+++ b/include/block/ufs.h
@@ -7,7 +7,7 @@
 
 typedef struct QEMU_PACKED UfsReg {
 uint32_t cap;
-uint32_t rsvd0;
+uint32_t mcqcap;
 uint32_t ver;
 uint32_t rsvd1;
 uint32_t hcpid;
@@ -46,6 +46,13 @@ typedef struct QEMU_PACKED UfsReg {
 uint32_t rsvd7[4];
 uint32_t rsvd8[16];
 uint32_t ccap;
+uint32_t rsvd9[127];
+uint32_t config;
+uint32_t rsvd10[3];
+uint32_t rsvd11[28];
+uint32_t mcqconfig;
+uint32_t esilba;
+uint32_t esiuba;
 } UfsReg;
 
 REG32(CAP, offsetof(UfsReg, cap))
@@ -57,6 +64,15 @@ REG32(CAP, offsetof(UfsReg, cap))
 FIELD(CAP, OODDS, 25, 1)
 FIELD(CAP, UICDMETMS, 26, 1)
 FIELD(CAP, CS, 28, 1)
+FIELD(CAP, LSDBS, 29, 1)
+FIELD(CAP, MCQS, 30, 1)
+REG32(MCQCAP, offsetof(UfsReg, mcqcap))
+FIELD(MCQCAP, MAXQ, 0, 8)
+FIELD(MCQCAP, SP, 8, 1)
+FIELD(MCQCAP, RRP, 9, 1)
+FIELD(MCQCAP, EIS, 10, 1)
+FIELD(MCQCAP, QCFGPTR, 16, 8)
+FIELD(MCQCAP, MIAG, 24, 8)
 REG32(VER, offsetof(UfsReg, ver))
 REG32(HCPID, offsetof(UfsReg, hcpid))
 REG32(HCMID, offsetof(UfsReg, hcmid))
@@ -78,6 +94,7 @@ REG32(IS, offsetof(UfsReg, is))
 FIELD(IS, HCFES, 16, 1)
 FIELD(IS, SBFES, 17, 1)
 FIELD(IS, CEFES, 18, 1)
+FIELD(IS, CQES, 20, 1)
 REG32(IE, offsetof(UfsReg, ie))
 FIELD(IE, UTRCE, 0, 1)
 FIELD(IE, UDEPRIE, 1, 1)
@@ -95,6 +112,7 @@ REG32(IE, offsetof(UfsReg, ie))
 FIELD(IE, HCFEE, 16, 1)
 FIELD(IE, SBFEE, 17, 1)
 FIELD(IE, CEFEE, 18, 1)
+FIELD(IE, CQEE, 20, 1)
 REG32(HCS, offsetof(UfsReg, hcs))
 FIELD(HCS, DP, 0, 1)
 FIELD(HCS, UTRLRDY, 1, 1)
@@ -128,6 +146,10 @@ REG32(UCMDARG1, offsetof(UfsReg, ucmdarg1))
 REG32(UCMDARG2, offsetof(UfsReg, ucmdarg2))
 REG32(UCMDARG3, offsetof(UfsReg, ucmdarg3))
 REG32(CCAP, offsetof(UfsReg, ccap))
+REG32(CONFIG, offsetof(UfsReg, config))
+FIELD(CONFIG, QT, 0, 1)
+REG32(MCQCONFIG, offsetof(UfsReg, mcqconfig))
+FIELD(MCQCONFIG, MAC, 8, 8)
 
 #define UFS_INTR_MASK\
 ((1 << R_IS_CEFES_SHIFT) | (1 << R_IS_SBFES_SHIFT) | \
@@ -157,6 +179,69 @@ REG32(CCAP, offsetof(UfsReg, ccap))
 ((be32_to_cpu(dword2) >> UFS_UPIU_HEADER_DATA_SEGMENT_LENGTH_SHIFT) & \
  UFS_UPIU_HEADER_DATA_SEGMENT_LENGTH_MASK)
 
+typedef struct QEMU_PACKED UfsMcqReg {
+uint32_t sqattr;
+uint32_t sqlba;
+uint32_t squba;
+uint32_t sqdao;
+uint32_t sqisao;
+uint32_t sqcfg;
+uint32_t rsvd0[2];
+uint32_t cqattr;
+uint32_t cqlba;
+uint32_t cquba;
+uint32_t cqdao;
+uint32_t cqisao;
+uint32_t cqcfg;
+uint32_t rsvd1[2];
+} UfsMcqReg;
+
+REG32(SQATTR, offsetof(UfsMcqReg, sqattr))
+FIELD(SQATTR, SIZE, 0, 16)
+FIELD(SQATTR, CQID, 16, 8)
+FIELD(SQATTR, SQPL, 28, 3)
+FIELD(SQATTR, SQEN, 31, 1)
+REG32(SQLBA, offsetof(UfsMcqReg, sqlba))
+REG32(SQUBA, offsetof(UfsMcqReg, squba))
+REG32(SQDAO, offsetof(UfsMcqReg, sqdao))
+REG32(SQISAO, offsetof(UfsMcqReg, sqisao))
+REG32(SQCFG, offsetof(UfsMcqReg, sqcfg))
+REG32(CQATTR, offsetof(UfsMcqReg, cqattr))
+FIELD(CQATTR, SIZE, 0, 16)
+FIELD(CQATTR, CQEN, 31, 1)
+REG32(CQLBA, offsetof(UfsMcqReg, cqlba))
+REG32(CQUBA, offsetof(UfsMcqReg, cquba))
+REG32(CQDAO, offsetof(UfsMcqReg, cqdao))
+REG32(CQISAO, offsetof(UfsMcqReg, cqisao))
+REG32(CQCFG, offsetof(UfsMcqReg, cqcfg))
+
+typedef struct QEMU_PACKED UfsMcqSqReg {
+uint32_t hp;
+uint32_t tp;
+uint32_t rtc;
+uint32_t cti;
+uint32_t rts;
+} UfsMcqSqReg;
+
+typedef struct QEMU_PACKED UfsMcqCqReg {
+uint32_t hp;
+uint32_t tp;
+} UfsMcqCqReg;
+
+typedef struct QEMU_PACKED UfsMcqSqIntReg {
+uint32_t is;
+uint32_t ie;
+} UfsMcqSqIntReg;
+
+typedef struct QEMU_PACKED UfsMcqCqIntReg {
+uint32_t is;
+uint32_t ie;
+uint32_t iacr;
+} UfsMcqCqIntReg;
+
+REG32(CQIS, offsetof(UfsMcqCqIntReg, is))
+FIELD(CQIS, TEPS, 0, 1)
+
 typedef struct QEMU_PACKED DeviceDescriptor {
 uint8_t length;
 uint8_t descriptor_idn;
@@ -1064,9 +1149,26 @@ typedef struct QEMU_PACKED UtpUpiuRsp {
 };
 } UtpUpiuRsp;
 
+/*
+ * MCQ Completion Queue Entry
+ */
+typedef UtpTransferReqDesc UfsSqEntry;
+typedef struct QEMU_PACKED UfsCqEntry {
+uint64_t utp_addr;
+uint16_t resp_len;
+uint16_t resp_off;
+uint16_t prdt_len;
+uint16_t prdt_off;
+uint8_t status;
+uint8_t error;
+uint16_t rsvd1;
+uint32_t rsvd2[3];
+} UfsCqEntry;
+
 static inline void _ufs_check_size(void)
 {
-QEMU_BUILD_BUG_ON(sizeof(UfsReg) != 0x104);
+QEMU_BUILD_BUG_ON(sizeof(UfsReg) != 0x38C);
+QE

[PATCH 0/2] hw/ufs: Add support MCQ

2024-05-21 Thread Minwoo Im
UFSHCI 4.0 spec introduced MCQ(Multi-Circular Queue) to support multiple
command queues for UFS controller.  To test ufs-mcq path of kernel, MCQ
emulated device would be a good choice to go with.

The first patch added newly introduced fields in UFSHCI 4.0 to support
MCQ.  The other one made the actual changes for MCQ.

Please review.

Thanks,

Minwoo Im (2):
  hw/ufs: Update MCQ-related fields to block/ufs.h
  hw/ufs: Add support MCQ of UFSHCI 4.0

 hw/ufs/trace-events |  17 ++
 hw/ufs/ufs.c| 474 ++--
 hw/ufs/ufs.h|  98 -
 include/block/ufs.h | 131 +++-
 4 files changed, 698 insertions(+), 22 deletions(-)

-- 
2.34.1




[PATCH v3 4/4] hw/nvme: Expand VI/VQ resource to uint32

2024-05-08 Thread Minwoo Im
From: Minwoo Im 

VI and VQ resources cover queue resources in each VFs in SR-IOV.
Current maximum I/O queue pair size is 0xffff, we can expand them to
cover the full number of I/O queue pairs.

This patch also fixes an Identify Secondary Controller List overflow due to
the expanded number of secondary controllers.

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 8 
 hw/nvme/nvme.h | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8db6828ab2a9..5a94f47b1cf1 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -8460,10 +8460,10 @@ static Property nvme_props[] = {
params.sriov_vq_flexible, 0),
 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
params.sriov_vi_flexible, 0),
-DEFINE_PROP_UINT8("sriov_max_vi_per_vf", NvmeCtrl,
-  params.sriov_max_vi_per_vf, 0),
-DEFINE_PROP_UINT8("sriov_max_vq_per_vf", NvmeCtrl,
-  params.sriov_max_vq_per_vf, 0),
+DEFINE_PROP_UINT32("sriov_max_vi_per_vf", NvmeCtrl,
+   params.sriov_max_vi_per_vf, 0),
+DEFINE_PROP_UINT32("sriov_max_vq_per_vf", NvmeCtrl,
+   params.sriov_max_vq_per_vf, 0),
 DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar,
  false),
 DEFINE_PROP_END_OF_LIST(),
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index cc6b4a3a64c2..aa708725c875 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -534,8 +534,8 @@ typedef struct NvmeParams {
 uint32_t  sriov_max_vfs;
 uint16_t sriov_vq_flexible;
 uint16_t sriov_vi_flexible;
-uint8_t  sriov_max_vq_per_vf;
-uint8_t  sriov_max_vi_per_vf;
+uint32_t  sriov_max_vq_per_vf;
+uint32_t  sriov_max_vi_per_vf;
 bool msix_exclusive_bar;
 } NvmeParams;
 
-- 
2.34.1




[PATCH v3 2/4] hw/nvme: separate identify data for sec. ctrl list

2024-05-08 Thread Minwoo Im
From: Minwoo Im 

Secondary controller list for virtualization has been managed by
Identify Secondary Controller List data structure with NvmeSecCtrlList
where up to 127 secondary controller entries can be managed.  The
problem hasn't arisen so far because NVME_MAX_VFS has been 127.

This patch separated identify data itself from the actual secondary
controller list managed by controller to support more than 127 secondary
controllers with the following patch.  This patch reused
NvmeSecCtrlEntry structure to manage all the possible secondary
controllers, and copy entries to identify data structure when the
command comes in.

Signed-off-by: Minwoo Im 
---
 hw/nvme/ctrl.c   | 21 ++---
 hw/nvme/nvme.h   | 14 --
 hw/nvme/subsys.c |  8 
 3 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 18672f66193f..7cf1e8e384b7 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -219,7 +219,6 @@
 #define NVME_TEMPERATURE_CRITICAL 0x175
 #define NVME_NUM_FW_SLOTS 1
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
-#define NVME_MAX_VFS 127
 #define NVME_VF_RES_GRANULARITY 1
 #define NVME_VF_OFFSET 0x1
 #define NVME_VF_STRIDE 1
@@ -5480,14 +5479,14 @@ static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl 
*n, NvmeRequest *req)
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
 uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
 uint16_t min_id = le16_to_cpu(c->ctrlid);
-uint8_t num_sec_ctrl = n->sec_ctrl_list.numcntl;
+uint8_t num_sec_ctrl = n->nr_sec_ctrls;
 NvmeSecCtrlList list = {0};
 uint8_t i;
 
 for (i = 0; i < num_sec_ctrl; i++) {
-if (n->sec_ctrl_list.sec[i].scid >= min_id) {
-list.numcntl = num_sec_ctrl - i;
-memcpy(, n->sec_ctrl_list.sec + i,
+if (n->sec_ctrl_list[i].scid >= min_id) {
+list.numcntl = MIN(num_sec_ctrl - i, 127);
+memcpy(, n->sec_ctrl_list + i,
list.numcntl * sizeof(NvmeSecCtrlEntry));
 break;
 }
@@ -7144,8 +7143,8 @@ static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType 
rst)
 
 if (n->params.sriov_max_vfs) {
 if (!pci_is_vf(pci_dev)) {
-for (i = 0; i < n->sec_ctrl_list.numcntl; i++) {
-sctrl = >sec_ctrl_list.sec[i];
+for (i = 0; i < n->nr_sec_ctrls; i++) {
+sctrl = >sec_ctrl_list[i];
 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
 }
 }
@@ -7934,7 +7933,7 @@ static bool nvme_check_params(NvmeCtrl *n, Error **errp)
 static void nvme_init_state(NvmeCtrl *n)
 {
 NvmePriCtrlCap *cap = >pri_ctrl_cap;
-NvmeSecCtrlList *list = >sec_ctrl_list;
+NvmeSecCtrlEntry *list = n->sec_ctrl_list;
 NvmeSecCtrlEntry *sctrl;
 PCIDevice *pci = PCI_DEVICE(n);
 uint8_t max_vfs;
@@ -7959,9 +7958,9 @@ static void nvme_init_state(NvmeCtrl *n)
 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
 QTAILQ_INIT(>aer_queue);
 
-list->numcntl = max_vfs;
+n->nr_sec_ctrls = max_vfs;
 for (i = 0; i < max_vfs; i++) {
-sctrl = >sec[i];
+sctrl = [i];
 sctrl->pcid = cpu_to_le16(n->cntlid);
 sctrl->vfn = cpu_to_le16(i + 1);
 }
@@ -8534,7 +8533,7 @@ static void nvme_sriov_post_write_config(PCIDevice *dev, 
uint16_t old_num_vfs)
 int i;
 
 for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) {
-sctrl = >sec_ctrl_list.sec[i];
+sctrl = >sec_ctrl_list[i];
 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
 }
 }
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index bed8191bd5fd..485b42c104ea 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -26,6 +26,7 @@
 
 #define NVME_MAX_CONTROLLERS 256
 #define NVME_MAX_NAMESPACES  256
+#define NVME_MAX_VFS 127
 #define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000)
 #define NVME_FDP_MAX_EVENTS 63
 #define NVME_FDP_MAXPIDS 128
@@ -612,7 +613,8 @@ typedef struct NvmeCtrl {
 } features;
 
 NvmePriCtrlCap  pri_ctrl_cap;
-NvmeSecCtrlList sec_ctrl_list;
+uint32_t nr_sec_ctrls;
+NvmeSecCtrlEntry sec_ctrl_list[NVME_MAX_VFS];
 struct {
 uint16_tvqrfap;
 uint16_tvirfap;
@@ -662,7 +664,7 @@ static inline NvmeSecCtrlEntry *nvme_sctrl(NvmeCtrl *n)
 NvmeCtrl *pf = NVME(pcie_sriov_get_pf(pci_dev));
 
 if (pci_is_vf(pci_dev)) {
-return >sec_ctrl_list.sec[pcie_sriov_vf_number(pci_dev)];
+return >sec_ctrl_list[pcie_sriov_vf_number(pci_dev)];
 }
 
 return NULL;
@@ -671,12 +673,12 @@ static inline NvmeSecCtrlEntry *nvme_sctrl(NvmeCtrl *n)
 static inline NvmeSecCtrlEntry *nvme_sctrl_for_cntlid(NvmeCtrl *n,
   uint16_t cntlid)
 {
-NvmeSecCtrlList *list = >sec_ctrl_list;
+NvmeSecCtrlEntry *list = n->sec_ct

[PATCH v3 3/4] hw/nvme: Allocate sec-ctrl-list as a dynamic array

2024-05-08 Thread Minwoo Im
From: Minwoo Im 

To prevent further bumping up the maximum number of VFs to support, this
patch allocates a dynamic array (NvmeCtrl *)->sec_ctrl_list based on
number of VF supported by sriov_max_vfs property.

Signed-off-by: Minwoo Im 
---
 hw/nvme/ctrl.c   | 8 +---
 hw/nvme/nvme.h   | 5 ++---
 hw/nvme/subsys.c | 2 ++
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 7cf1e8e384b7..8db6828ab2a9 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -7863,12 +7863,6 @@ static bool nvme_check_params(NvmeCtrl *n, Error **errp)
 return false;
 }
 
-if (params->sriov_max_vfs > NVME_MAX_VFS) {
-error_setg(errp, "sriov_max_vfs must be between 0 and %d",
-   NVME_MAX_VFS);
-return false;
-}
-
 if (params->cmb_size_mb) {
 error_setg(errp, "CMB is not supported with SR-IOV");
 return false;
@@ -8461,7 +8455,7 @@ static Property nvme_props[] = {
 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
  params.auto_transition_zones, true),
-DEFINE_PROP_UINT8("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
+DEFINE_PROP_UINT32("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
 DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
params.sriov_vq_flexible, 0),
 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 485b42c104ea..cc6b4a3a64c2 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -26,7 +26,6 @@
 
 #define NVME_MAX_CONTROLLERS 256
 #define NVME_MAX_NAMESPACES  256
-#define NVME_MAX_VFS 127
 #define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000)
 #define NVME_FDP_MAX_EVENTS 63
 #define NVME_FDP_MAXPIDS 128
@@ -532,7 +531,7 @@ typedef struct NvmeParams {
 bool auto_transition_zones;
 bool legacy_cmb;
 bool ioeventfd;
-uint8_t  sriov_max_vfs;
+uint32_t  sriov_max_vfs;
 uint16_t sriov_vq_flexible;
 uint16_t sriov_vi_flexible;
 uint8_t  sriov_max_vq_per_vf;
@@ -614,7 +613,7 @@ typedef struct NvmeCtrl {
 
 NvmePriCtrlCap  pri_ctrl_cap;
 uint32_t nr_sec_ctrls;
-NvmeSecCtrlEntry sec_ctrl_list[NVME_MAX_VFS];
+NvmeSecCtrlEntry *sec_ctrl_list;
 struct {
 uint16_tvqrfap;
 uint16_tvirfap;
diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c
index 561ed04a5317..77deaf2c2c97 100644
--- a/hw/nvme/subsys.c
+++ b/hw/nvme/subsys.c
@@ -61,6 +61,8 @@ int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp)
 if (pci_is_vf(>parent_obj)) {
 cntlid = le16_to_cpu(sctrl->scid);
 } else {
+n->sec_ctrl_list = g_new0(NvmeSecCtrlEntry, num_vfs);
+
 for (cntlid = 0; cntlid < ARRAY_SIZE(subsys->ctrls); cntlid++) {
 if (!subsys->ctrls[cntlid]) {
 break;
-- 
2.34.1




[PATCH v3 0/4] hw/nvme: FDP and SR-IOV enhancements

2024-05-08 Thread Minwoo Im
Hello,

This patchset has rebased on the latest master and replaced 3rd patch
to one which allocates a dynamic array for secondary controller list
based on the maximum number of VFs (sriov_max_vfs) rather than a static
size of static array as Klaus suggested.  Rest of the patchset are the
same with the previous one.

This patchset has been tested with the following simple script more than
127 VFs.

-device nvme-subsys,id=subsys0 \
-device ioh3420,id=rp2,multifunction=on,chassis=12 \
-device 
nvme,serial=foo,id=nvme0,bus=rp2,subsys=subsys0,mdts=9,msix_qsize=130,max_ioqpairs=260,sriov_max_vfs=129,sriov_vq_flexible=258,sriov_vi_flexible=129
 \

$ cat nvme-enable-vfs.sh
#!/bin/bash

nr_vfs=129

for (( i=1; i<=$nr_vfs; i++ ))
do
nvme virt-mgmt /dev/nvme0 -c $i -r 0 -a 8 -n 2
nvme virt-mgmt /dev/nvme0 -c $i -r 1 -a 8 -n 1
done

bdf=":01:00.0"
sysfs="/sys/bus/pci/devices/$bdf"
nvme="/sys/bus/pci/drivers/nvme"

echo 0 > $sysfs/sriov_drivers_autoprobe
echo $nr_vfs > $sysfs/sriov_numvfs

for (( i=1; i<=$nr_vfs; i++ ))
do
nvme virt-mgmt /dev/nvme0 -c $i -a 9

echo "nvme" > $sysfs/virtfn$(($i-1))/driver_override
bdf="$(basename $(readlink $sysfs/virtfn$(($i-1))))"
echo $bdf > $nvme/bind
done

Thanks,

v3:
 - Replace [3/4] patch with one allocating a dynamic array of secondary
   controller list rather than a static array with a fixed size of
   maximum number of VF to support (Suggested by Klaus).
v2:     
 - Added [2/4] commit to fix crash due to entry overflow

Minwoo Im (4):
  hw/nvme: add Identify Endurance Group List
  hw/nvme: separate identify data for sec. ctrl list
  hw/nvme: Allocate sec-ctrl-list as a dynamic array
  hw/nvme: Expand VI/VQ resource to uint32

 hw/nvme/ctrl.c   | 59 +++-
 hw/nvme/nvme.h   | 19 +++---
 hw/nvme/subsys.c | 10 +---
 include/block/nvme.h |  1 +
 4 files changed, 54 insertions(+), 35 deletions(-)

-- 
2.34.1




[PATCH v3 1/4] hw/nvme: add Identify Endurance Group List

2024-05-08 Thread Minwoo Im
From: Minwoo Im 

Commit 73064edfb864 ("hw/nvme: flexible data placement emulation")
introduced NVMe FDP feature to nvme-subsys and nvme-ctrl with a
single endurance group #1 supported.  This means that controller should
return proper identify data to host with Identify Endurance Group List
(CNS 19h).  But, yes, only just for the endurance group #1.  This patch
allows host applications to ask for which endurance group is available
and utilize FDP through that endurance group.

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 22 ++
 include/block/nvme.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 127c3d238346..18672f66193f 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5629,6 +5629,26 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, 
NvmeRequest *req,
 return nvme_c2h(n, list, data_len, req);
 }
 
+static uint16_t nvme_endurance_group_list(NvmeCtrl *n, NvmeRequest *req)
+{
+uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
+uint16_t *nr_ids = [0];
+uint16_t *ids = [1];
+uint16_t endgid = le32_to_cpu(req->cmd.cdw11) & 0xffff;
+
+/*
+ * The current nvme-subsys only supports Endurance Group #1.
+ */
+if (!endgid) {
+*nr_ids = 1;
+ids[0] = 1;
+} else {
+*nr_ids = 0;
+}
+
+return nvme_c2h(n, list, sizeof(list), req);
+}
+
 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeNamespace *ns;
@@ -5744,6 +5764,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_identify_nslist(n, req, false);
 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
 return nvme_identify_nslist_csi(n, req, true);
+case NVME_ID_CNS_ENDURANCE_GROUP_LIST:
+return nvme_endurance_group_list(n, req);
 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
 return nvme_identify_nslist_csi(n, req, false);
 case NVME_ID_CNS_NS_DESCR_LIST:
diff --git a/include/block/nvme.h b/include/block/nvme.h
index bb231d0b9ad0..7c77d38174a7 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1074,6 +1074,7 @@ enum NvmeIdCns {
 NVME_ID_CNS_CTRL_LIST = 0x13,
 NVME_ID_CNS_PRIMARY_CTRL_CAP  = 0x14,
 NVME_ID_CNS_SECONDARY_CTRL_LIST   = 0x15,
+NVME_ID_CNS_ENDURANCE_GROUP_LIST  = 0x19,
 NVME_ID_CNS_CS_NS_PRESENT_LIST= 0x1a,
 NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
 NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
-- 
2.34.1




Re: [PATCH v2 3/4] hw/nvme: Support SR-IOV VFs more than 127

2024-05-07 Thread Minwoo Im
On 24-05-01 14:46:39, Klaus Jensen wrote:
> On Apr  1 04:30, Minwoo Im wrote:
> > From: Minwoo Im 
> > 
> > The number of virtual functions(VFs) supported in SR-IOV is 64k as per
> > spec.  To test a large number of MSI-X vectors mapping to CPU matrix in
> > the QEMU system, we need much more than 127 VFs.  This patch made
support for 256 VFs per physical function (PF).
> > 
> 
> With patch 2 in place, shouldn't it be relatively straight forward to
> convert the static array to be dynamic and just use numvfs to size it?
> Then we won't have to add another patch when someone comes around and
> wants to bump this again ;)

Sorry for the late response here.  I will update the 3rd patch to
convert secondary controller list static array to a dynamic array with
making the max_vfs parameter to uint32.



[PATCH v2 2/4] hw/nvme: separate identify data for sec. ctrl list

2024-03-31 Thread Minwoo Im
From: Minwoo Im 

Secondary controller list for virtualization has been managed by
Identify Secondary Controller List data structure with NvmeSecCtrlList
where up to 127 secondary controller entries can be managed.  The
problem hasn't arisen so far because NVME_MAX_VFS has been 127.

This patch separated identify data itself from the actual secondary
controller list managed by controller to support more than 127 secondary
controllers with the following patch.  This patch reused
NvmeSecCtrlEntry structure to manage all the possible secondary
controllers, and copy entries to identify data structure when the
command comes in.

Signed-off-by: Minwoo Im 
---
 hw/nvme/ctrl.c   | 21 ++---
 hw/nvme/nvme.h   | 14 --
 hw/nvme/subsys.c |  8 
 3 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index cfe53a358871..7e60bc9f2075 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -219,7 +219,6 @@
 #define NVME_TEMPERATURE_CRITICAL 0x175
 #define NVME_NUM_FW_SLOTS 1
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
-#define NVME_MAX_VFS 127
 #define NVME_VF_RES_GRANULARITY 1
 #define NVME_VF_OFFSET 0x1
 #define NVME_VF_STRIDE 1
@@ -5480,14 +5479,14 @@ static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl 
*n, NvmeRequest *req)
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
 uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
 uint16_t min_id = le16_to_cpu(c->ctrlid);
-uint8_t num_sec_ctrl = n->sec_ctrl_list.numcntl;
+uint8_t num_sec_ctrl = n->nr_sec_ctrls;
 NvmeSecCtrlList list = {0};
 uint8_t i;
 
 for (i = 0; i < num_sec_ctrl; i++) {
-if (n->sec_ctrl_list.sec[i].scid >= min_id) {
-list.numcntl = num_sec_ctrl - i;
-memcpy(, n->sec_ctrl_list.sec + i,
+if (n->sec_ctrl_list[i].scid >= min_id) {
+list.numcntl = MIN(num_sec_ctrl - i, 127);
+memcpy(, n->sec_ctrl_list + i,
list.numcntl * sizeof(NvmeSecCtrlEntry));
 break;
 }
@@ -7132,8 +7131,8 @@ static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType 
rst)
 
 if (n->params.sriov_max_vfs) {
 if (!pci_is_vf(pci_dev)) {
-for (i = 0; i < n->sec_ctrl_list.numcntl; i++) {
-sctrl = >sec_ctrl_list.sec[i];
+for (i = 0; i < n->nr_sec_ctrls; i++) {
+sctrl = >sec_ctrl_list[i];
 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
 }
 
@@ -7921,7 +7920,7 @@ static bool nvme_check_params(NvmeCtrl *n, Error **errp)
 static void nvme_init_state(NvmeCtrl *n)
 {
 NvmePriCtrlCap *cap = >pri_ctrl_cap;
-NvmeSecCtrlList *list = >sec_ctrl_list;
+NvmeSecCtrlEntry *list = n->sec_ctrl_list;
 NvmeSecCtrlEntry *sctrl;
 PCIDevice *pci = PCI_DEVICE(n);
 uint8_t max_vfs;
@@ -7946,9 +7945,9 @@ static void nvme_init_state(NvmeCtrl *n)
 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
 QTAILQ_INIT(>aer_queue);
 
-list->numcntl = cpu_to_le16(max_vfs);
+n->nr_sec_ctrls = max_vfs;
 for (i = 0; i < max_vfs; i++) {
-sctrl = >sec[i];
+sctrl = [i];
 sctrl->pcid = cpu_to_le16(n->cntlid);
 sctrl->vfn = cpu_to_le16(i + 1);
 }
@@ -8505,7 +8504,7 @@ static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, 
uint32_t address,
 if (!(val & PCI_SRIOV_CTRL_VFE)) {
 num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
 for (i = 0; i < num_vfs; i++) {
-sctrl = >sec_ctrl_list.sec[i];
+sctrl = >sec_ctrl_list[i];
 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
 }
 }
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 5f2ae7b28b9c..02c11d909cd1 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -26,6 +26,7 @@
 
 #define NVME_MAX_CONTROLLERS 256
 #define NVME_MAX_NAMESPACES  256
+#define NVME_MAX_VFS 127
 #define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000)
 #define NVME_FDP_MAX_EVENTS 63
 #define NVME_FDP_MAXPIDS 128
@@ -597,7 +598,8 @@ typedef struct NvmeCtrl {
 } features;
 
 NvmePriCtrlCap  pri_ctrl_cap;
-NvmeSecCtrlList sec_ctrl_list;
+uint32_t nr_sec_ctrls;
+NvmeSecCtrlEntry sec_ctrl_list[NVME_MAX_VFS];
 struct {
 uint16_tvqrfap;
 uint16_tvirfap;
@@ -647,7 +649,7 @@ static inline NvmeSecCtrlEntry *nvme_sctrl(NvmeCtrl *n)
 NvmeCtrl *pf = NVME(pcie_sriov_get_pf(pci_dev));
 
 if (pci_is_vf(pci_dev)) {
-return >sec_ctrl_list.sec[pcie_sriov_vf_number(pci_dev)];
+return >sec_ctrl_list[pcie_sriov_vf_number(pci_dev)];
 }
 
 return NULL;
@@ -656,12 +658,12 @@ static inline NvmeSecCtrlEntry *nvme_sctrl(NvmeCtrl *n)
 static inline NvmeSecCtrlEntry *nvme_sctrl_for_cntlid(NvmeCtrl *n,
   

[PATCH v2 1/4] hw/nvme: add Identify Endurance Group List

2024-03-31 Thread Minwoo Im
From: Minwoo Im 

Commit 73064edfb864 ("hw/nvme: flexible data placement emulation")
introduced NVMe FDP feature to nvme-subsys and nvme-ctrl with a
single endurance group #1 supported.  This means that controller should
return proper identify data to host with Identify Endurance Group List
(CNS 19h).  But, yes, only just for the endurance group #1.  This patch
allows host applications to ask for which endurance group is available
and utilize FDP through that endurance group.

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 22 ++
 include/block/nvme.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index f026245d1e9e..cfe53a358871 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5629,6 +5629,26 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, 
NvmeRequest *req,
 return nvme_c2h(n, list, data_len, req);
 }
 
+static uint16_t nvme_endurance_group_list(NvmeCtrl *n, NvmeRequest *req)
+{
+uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
+uint16_t *nr_ids = [0];
+uint16_t *ids = [1];
+uint16_t endgid = le32_to_cpu(req->cmd.cdw11) & 0xffff;
+
+/*
+ * The current nvme-subsys only supports Endurance Group #1.
+ */
+if (!endgid) {
+*nr_ids = 1;
+ids[0] = 1;
+} else {
+*nr_ids = 0;
+}
+
+return nvme_c2h(n, list, sizeof(list), req);
+}
+
 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeNamespace *ns;
@@ -5732,6 +5752,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_identify_nslist(n, req, false);
 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
 return nvme_identify_nslist_csi(n, req, true);
+case NVME_ID_CNS_ENDURANCE_GROUP_LIST:
+return nvme_endurance_group_list(n, req);
 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
 return nvme_identify_nslist_csi(n, req, false);
 case NVME_ID_CNS_NS_DESCR_LIST:
diff --git a/include/block/nvme.h b/include/block/nvme.h
index bb231d0b9ad0..7c77d38174a7 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1074,6 +1074,7 @@ enum NvmeIdCns {
 NVME_ID_CNS_CTRL_LIST = 0x13,
 NVME_ID_CNS_PRIMARY_CTRL_CAP  = 0x14,
 NVME_ID_CNS_SECONDARY_CTRL_LIST   = 0x15,
+NVME_ID_CNS_ENDURANCE_GROUP_LIST  = 0x19,
 NVME_ID_CNS_CS_NS_PRESENT_LIST= 0x1a,
 NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
 NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
-- 
2.34.1




[PATCH v2 4/4] hw/nvme: Expand VI/VQ resource to uint32

2024-03-31 Thread Minwoo Im
From: Minwoo Im 

VI and VQ resources cover queue resources in each VFs in SR-IOV.
Current maximum I/O queue pair size is 0xffff, we can expand them to
cover the full number of I/O queue pairs.

This patch also fixes an Identify Secondary Controller List overflow due to
the expanded number of secondary controllers.

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 8 
 hw/nvme/nvme.h | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 893d4e96656b..893afae29336 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -8429,10 +8429,10 @@ static Property nvme_props[] = {
params.sriov_vq_flexible, 0),
 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
params.sriov_vi_flexible, 0),
-DEFINE_PROP_UINT8("sriov_max_vi_per_vf", NvmeCtrl,
-  params.sriov_max_vi_per_vf, 0),
-DEFINE_PROP_UINT8("sriov_max_vq_per_vf", NvmeCtrl,
-  params.sriov_max_vq_per_vf, 0),
+DEFINE_PROP_UINT32("sriov_max_vi_per_vf", NvmeCtrl,
+   params.sriov_max_vi_per_vf, 0),
+DEFINE_PROP_UINT32("sriov_max_vq_per_vf", NvmeCtrl,
+   params.sriov_max_vq_per_vf, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index ad928c28f2c5..492617f19515 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -521,8 +521,8 @@ typedef struct NvmeParams {
 uint16_t  sriov_max_vfs;
 uint16_t sriov_vq_flexible;
 uint16_t sriov_vi_flexible;
-uint8_t  sriov_max_vq_per_vf;
-uint8_t  sriov_max_vi_per_vf;
+uint32_t  sriov_max_vq_per_vf;
+uint32_t  sriov_max_vi_per_vf;
 } NvmeParams;
 
 typedef struct NvmeCtrl {
-- 
2.34.1




[PATCH v2 3/4] hw/nvme: Support SR-IOV VFs more than 127

2024-03-31 Thread Minwoo Im
From: Minwoo Im 

The number of virtual functions(VFs) supported in SR-IOV is 64k as per
spec.  To test a large number of MSI-X vectors mapping to CPU matrix in
the QEMU system, we need much more than 127 VFs.  This patch adds
support for 256 VFs per physical function (PF).

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 2 +-
 hw/nvme/nvme.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 7e60bc9f2075..893d4e96656b 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -8424,7 +8424,7 @@ static Property nvme_props[] = {
 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
  params.auto_transition_zones, true),
-DEFINE_PROP_UINT8("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
+DEFINE_PROP_UINT16("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
 DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
params.sriov_vq_flexible, 0),
 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 02c11d909cd1..ad928c28f2c5 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -26,7 +26,7 @@
 
 #define NVME_MAX_CONTROLLERS 256
 #define NVME_MAX_NAMESPACES  256
-#define NVME_MAX_VFS 127
+#define NVME_MAX_VFS 256
 #define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000)
 #define NVME_FDP_MAX_EVENTS 63
 #define NVME_FDP_MAXPIDS 128
@@ -518,7 +518,7 @@ typedef struct NvmeParams {
 bool auto_transition_zones;
 bool legacy_cmb;
 bool ioeventfd;
-uint8_t  sriov_max_vfs;
+uint16_t  sriov_max_vfs;
 uint16_t sriov_vq_flexible;
 uint16_t sriov_vi_flexible;
 uint8_t  sriov_max_vq_per_vf;
-- 
2.34.1




[PATCH v2 0/4] hw/nvme: FDP and SR-IOV enhancements

2024-03-31 Thread Minwoo Im
Hello,

This patch set added support for Identify Endurance Group List only just
for 'endgrpid=1' for FDP.  Along with this, the following three patches
are to support more than 127 secondary controllers for SR-IOV with VI/VQ
resources.  [2/4] separated Identify controller data structure for
secondary controller list from the actual secondary controller list
managed by the pf to support proper identify data based on the given
cntlid which is a minimum controller id to retrieve.  [3/4] and [4/4]
are actual patches increasing the number of resources of SR-IOV.

Thanks,

v2:
 - Added [2/4] commit to fix crash due to entry overflow

Minwoo Im (4):
  hw/nvme: add Identify Endurance Group List
  hw/nvme: separate identify data for sec. ctrl list
  hw/nvme: Support SR-IOV VFs more than 127
  hw/nvme: Expand VI/VQ resource to uint32

 hw/nvme/ctrl.c   | 53 +++-
 hw/nvme/nvme.h   | 20 +
 hw/nvme/subsys.c |  8 +++
 include/block/nvme.h |  1 +
 4 files changed, 53 insertions(+), 29 deletions(-)

-- 
2.34.1




Re: [PATCH] hw/nvme: fix invalid endian conversion

2024-02-24 Thread Minwoo Im
On 24-02-22 10:29:06, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> numcntl is one byte and so is max_vfs. Using cpu_to_le16 on big endian
> hosts results in numcntl being set to 0.
> 
> Fix by dropping the endian conversion.
> 
> Fixes: 746d42b13368 ("hw/nvme: Initialize capability structures for 
> primary/secondary controllers")
> Reported-by: Kevin Wolf 
> Signed-off-by: Klaus Jensen 

Reviewed-by: Minwoo Im 

Thanks,



[PATCH 0/3] hw/nvme: FDP and SR-IOV enhancements

2024-02-14 Thread Minwoo Im
Hello,

This patchset includes patches for adding Identify data for the
recently added Endurance Group (endgrpid=1) used in FDP, and patches
for increasing the maximum number of SR-IOV VF Resources to support
more resources to enable testing as recent SSDs.

Thanks,

Minwoo Im (3):
  hw/nvme: add Identify Endurance Group List
  hw/nvme: Support SR-IOV VFs more than 127
  hw/nvme: Expand VI/VQ resource to uint32

 hw/nvme/ctrl.c   | 36 +---
 hw/nvme/nvme.h   |  6 +++---
 include/block/nvme.h |  1 +
 3 files changed, 33 insertions(+), 10 deletions(-)

-- 
2.34.1




[PATCH 1/3] hw/nvme: add Identify Endurance Group List

2024-02-14 Thread Minwoo Im
From: Minwoo Im 

Commit 73064edfb864 ("hw/nvme: flexible data placement emulation")
introduced NVMe FDP feature to nvme-subsys and nvme-ctrl with a
single endurance group #1 supported.  This means that controller should
return proper identify data to host with Identify Endurance Group List
(CNS 19h).  But, yes, only just for the endurance group #1.  This patch
allows host applications to ask for which endurance group is available
and utilize FDP through that endurance group.

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 22 ++
 include/block/nvme.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index f026245d1e9e..cfe53a358871 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5629,6 +5629,26 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, 
NvmeRequest *req,
 return nvme_c2h(n, list, data_len, req);
 }
 
+static uint16_t nvme_endurance_group_list(NvmeCtrl *n, NvmeRequest *req)
+{
+uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
+uint16_t *nr_ids = &list[0];
+uint16_t *ids = &list[1];
+uint16_t endgid = le32_to_cpu(req->cmd.cdw11) & 0xffff;
+
+/*
+ * The current nvme-subsys only supports Endurance Group #1.
+ */
+if (!endgid) {
+*nr_ids = 1;
+ids[0] = 1;
+} else {
+*nr_ids = 0;
+}
+
+return nvme_c2h(n, list, sizeof(list), req);
+}
+
 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeNamespace *ns;
@@ -5732,6 +5752,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_identify_nslist(n, req, false);
 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
 return nvme_identify_nslist_csi(n, req, true);
+case NVME_ID_CNS_ENDURANCE_GROUP_LIST:
+return nvme_endurance_group_list(n, req);
 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
 return nvme_identify_nslist_csi(n, req, false);
 case NVME_ID_CNS_NS_DESCR_LIST:
diff --git a/include/block/nvme.h b/include/block/nvme.h
index bb231d0b9ad0..7c77d38174a7 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1074,6 +1074,7 @@ enum NvmeIdCns {
 NVME_ID_CNS_CTRL_LIST = 0x13,
 NVME_ID_CNS_PRIMARY_CTRL_CAP  = 0x14,
 NVME_ID_CNS_SECONDARY_CTRL_LIST   = 0x15,
+NVME_ID_CNS_ENDURANCE_GROUP_LIST  = 0x19,
 NVME_ID_CNS_CS_NS_PRESENT_LIST= 0x1a,
 NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
 NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
-- 
2.34.1




[PATCH 2/3] hw/nvme: Support SR-IOV VFs more than 127

2024-02-14 Thread Minwoo Im
From: Minwoo Im 

The number of virtual functions(VFs) supported in SR-IOV is 64k as per
spec.  To test a large number of MSI-X vectors mapping to CPU matrix in
the QEMU system, we need much more than 127 VFs.  This patch adds
support for 256 VFs per physical function (PF).

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 4 ++--
 hw/nvme/nvme.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index cfe53a358871..8198fd2d8e46 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -219,7 +219,7 @@
 #define NVME_TEMPERATURE_CRITICAL 0x175
 #define NVME_NUM_FW_SLOTS 1
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
-#define NVME_MAX_VFS 127
+#define NVME_MAX_VFS 256
 #define NVME_VF_RES_GRANULARITY 1
 #define NVME_VF_OFFSET 0x1
 #define NVME_VF_STRIDE 1
@@ -8425,7 +8425,7 @@ static Property nvme_props[] = {
 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
  params.auto_transition_zones, true),
-DEFINE_PROP_UINT8("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
+DEFINE_PROP_UINT16("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
 DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
params.sriov_vq_flexible, 0),
 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 5f2ae7b28b9c..db2cda098ebd 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -517,7 +517,7 @@ typedef struct NvmeParams {
 bool auto_transition_zones;
 bool legacy_cmb;
 bool ioeventfd;
-uint8_t  sriov_max_vfs;
+uint16_t  sriov_max_vfs;
 uint16_t sriov_vq_flexible;
 uint16_t sriov_vi_flexible;
 uint8_t  sriov_max_vq_per_vf;
-- 
2.34.1




[PATCH 3/3] hw/nvme: Expand VI/VQ resource to uint32

2024-02-14 Thread Minwoo Im
From: Minwoo Im 

VI and VQ resources cover queue resources in each VFs in SR-IOV.
Current maximum I/O queue pair size is 0xffff, we can expand them to
cover the full number of I/O queue pairs.

This patch also fixes an Identify Secondary Controller List overflow due
to the expanded number of secondary controllers.

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 10 +-
 hw/nvme/nvme.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8198fd2d8e46..6f3fd96f7572 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5486,7 +5486,7 @@ static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, 
NvmeRequest *req)
 
 for (i = 0; i < num_sec_ctrl; i++) {
 if (n->sec_ctrl_list.sec[i].scid >= min_id) {
-list.numcntl = num_sec_ctrl - i;
+list.numcntl = (num_sec_ctrl - i > 127) ? 127 : num_sec_ctrl - i;
 memcpy(&list.sec, n->sec_ctrl_list.sec + i,
list.numcntl * sizeof(NvmeSecCtrlEntry));
 break;
@@ -8430,10 +8430,10 @@ static Property nvme_props[] = {
params.sriov_vq_flexible, 0),
 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
params.sriov_vi_flexible, 0),
-DEFINE_PROP_UINT8("sriov_max_vi_per_vf", NvmeCtrl,
-  params.sriov_max_vi_per_vf, 0),
-DEFINE_PROP_UINT8("sriov_max_vq_per_vf", NvmeCtrl,
-  params.sriov_max_vq_per_vf, 0),
+DEFINE_PROP_UINT32("sriov_max_vi_per_vf", NvmeCtrl,
+   params.sriov_max_vi_per_vf, 0),
+DEFINE_PROP_UINT32("sriov_max_vq_per_vf", NvmeCtrl,
+   params.sriov_max_vq_per_vf, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index db2cda098ebd..d0f4c6c9b7af 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -520,8 +520,8 @@ typedef struct NvmeParams {
 uint16_t  sriov_max_vfs;
 uint16_t sriov_vq_flexible;
 uint16_t sriov_vi_flexible;
-uint8_t  sriov_max_vq_per_vf;
-uint8_t  sriov_max_vi_per_vf;
+uint32_t  sriov_max_vq_per_vf;
+uint32_t  sriov_max_vi_per_vf;
 } NvmeParams;
 
 typedef struct NvmeCtrl {
-- 
2.34.1




RE: [PATCH v3 6/7] pcie_sriov: Reuse SR-IOV VF device instances

2024-02-13 Thread Minwoo Im
> -Original Message-
> From: qemu-block-bounces+minwoo.im.dev=gmail@nongnu.org  bounces+minwoo.im.dev=gmail@nongnu.org> On Behalf Of Akihiko Odaki
> Sent: Monday, February 12, 2024 7:21 PM
> To: Philippe Mathieu-Daudé ; Michael S. Tsirkin
> ; Marcel Apfelbaum ; Alex
> Williamson ; Cédric Le Goater ;
> Paolo Bonzini ; Daniel P. Berrangé ;
> Eduardo Habkost ; Sriram Yagnaraman
> ; Jason Wang ; Keith Busch
> ; Klaus Jensen 
> Cc: qemu-devel@nongnu.org; qemu-bl...@nongnu.org; Akihiko Odaki
> 
> Subject: [PATCH v3 6/7] pcie_sriov: Reuse SR-IOV VF device instances
>
> Disable SR-IOV VF devices by reusing code to power down PCI devices
> instead of removing them when the guest requests to disable VFs. This
> allows to realize devices and report VF realization errors at PF
> realization time.
>
> Signed-off-by: Akihiko Odaki 

Hello Akihiko,

I think this patch fixes the issue reported in [1].  The latest master branch
also causes an object-related assertion error when we enable VF(s) and disable
through sysfs over and over again (at least twice).  But this issue is also
fixed with your patch.

**
ERROR:../qom/object.c:753:object_finalize: assertion failed: (obj->parent == 
NULL)
Bail out! ERROR:../qom/object.c:753:object_finalize: assertion failed: 
(obj->parent == NULL)

Klaus,

If this patchset is applied, I think [1] can be dropped.  What do you think?

Thanks,

[1] 
https://lore.kernel.org/qemu-devel/20240109022953epcms2p54550dcfc9f831a515206513ae98e7511@epcms2p5/




Re: [PATCH] hw/nvme: fix invalid check on mcl

2024-02-08 Thread Minwoo Im
On 24-02-08 13:22:48, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> The number of logical blocks within a source range is converted into a
> 1s based number at the time of parsing. However, when verifying the copy
> length we add one again, causing the check against MCL to fail in error.
> 
> Fixes: 381ab99d8587 ("hw/nvme: check maximum copy length (MCL) for COPY")
> Signed-off-by: Klaus Jensen 

Hi Klaus,

Reviewed-by: Minwoo Im 

Thanks!

> ---
>  hw/nvme/ctrl.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index f026245d1e9e..05c667158a3a 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -2855,7 +2855,7 @@ static inline uint16_t 
> nvme_check_copy_mcl(NvmeNamespace *ns,
>  uint32_t nlb;
>  nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
> >   &nlb, NULL, NULL, NULL);
> -copy_len += nlb + 1;
> +copy_len += nlb;
>  }
>  
>  if (copy_len > ns->id_ns.mcl) {
> 
> ---
> base-commit: 39a6e4f87e7b75a45b08d6dc8b8b7c2954c87440
> change-id: 20240208-fix-copy-mcl-check-3a6d95327154
> 
> Best regards,
> -- 
> Klaus Jensen 
> 
> 



hw: nvme: Separate 'serial' property for VFs

2024-01-08 Thread Minwoo Im
Currently, when a VF is created, it uses the 'params' object of the PF
as it is. In other words, the 'params.serial' string memory area is
also shared. In this situation, if the VF is removed from the system,
the PF's 'params.serial' object is released with object_finalize()
followed by object_property_del_all() which release the memory for
'serial' property. If that happens, the next VF created will inherit
a serial from a corrupted memory area.

If this happens, an error will occur when comparing subsys->serial and
n->params.serial in the nvme_subsys_register_ctrl() function.

Cc: qemu-sta...@nongnu.org
Fixes: 44c2c09488db ("hw/nvme: Add support for SR-IOV")
Signed-off-by: Minwoo Im 
---
 hw/nvme/ctrl.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index f026245d1e..a0ba3529cd 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -8309,9 +8309,15 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
**errp)
 if (pci_is_vf(pci_dev)) {
 /*
  * VFs derive settings from the parent. PF's lifespan exceeds
- * that of VF's, so it's safe to share params.serial.
+ * that of VF's.
  */
 memcpy(>params, >params, sizeof(NvmeParams));
+
+/*
+ * Set PF's serial value to a new string memory to prevent 'serial'
+ * property object release of PF when a VF is removed from the system.
+ */
+n->params.serial = g_strdup(pn->params.serial);
 n->subsys = pn->subsys;
 }
 
-- 
2.34.1



Re: [PATCH] nvme: remove constant argument to tracepoint

2023-04-17 Thread Minwoo Im
On 23-03-30 14:44:27, Paolo Bonzini wrote:
> The last argument to -pci_nvme_err_startfail_virt_state is always "OFFLINE"
> due to the enclosing "if" condition requiring !sctrl->scs.  Reported by
> Coverity.
> 
> Signed-off-by: Paolo Bonzini 

Reviewed-by: Minwoo Im 



[PATCH RESEND 0/2] hw/nvme: COPY fixes

2023-04-17 Thread Minwoo Im
Fix status code overwritten issue in the COPY command and a trivial
patch to check the Maximum Copy Length (MCL) for COPY command.

Minwoo Im (2):
  hw/nvme: consider COPY command in nvme_aio_err
  hw/nvme: check maximum copy length (MCL) for COPY

 hw/nvme/ctrl.c | 25 +
 1 file changed, 25 insertions(+)

-- 
2.34.1




[PATCH RESEND 1/2] hw/nvme: consider COPY command in nvme_aio_err

2023-04-17 Thread Minwoo Im
From: Minwoo Im 

If we don't have NVME_CMD_COPY consideration in the switch statement in
nvme_aio_err(), it will go to have NVME_INTERNAL_DEV_ERROR and
`req->status` will be ovewritten to it.  During the aio context, it
might set the NVMe status field like NVME_CMD_SIZE_LIMIT, but it's
overwritten in the nvme_aio_err().

Add consideration for the NVME_CMD_COPY not to overwrite the status at
the end of the function.

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8b7be14209..754f91e220 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1748,6 +1748,7 @@ static void nvme_aio_err(NvmeRequest *req, int ret)
 case NVME_CMD_WRITE:
 case NVME_CMD_WRITE_ZEROES:
 case NVME_CMD_ZONE_APPEND:
+case NVME_CMD_COPY:
 status = NVME_WRITE_FAULT;
 break;
 default:
-- 
2.34.1




[PATCH RESEND 2/2] hw/nvme: check maximum copy length (MCL) for COPY

2023-04-17 Thread Minwoo Im
From: Minwoo Im 

MCL(Maximum Copy Length) in the Identify Namespace data structure limits
the number of LBAs to be copied inside of the controller.  We've not
checked it at all, so added the check with returning the proper error
status.

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 754f91e220..9a363ec219 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -2845,6 +2845,25 @@ static void nvme_copy_source_range_parse(void *ranges, 
int idx, uint8_t format,
 }
 }
 
+static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
+   NvmeCopyAIOCB *iocb, uint16_t nr)
+{
+uint32_t copy_len = 0;
+
+for (int idx = 0; idx < nr; idx++) {
+uint32_t nlb;
+nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
+ &nlb, NULL, NULL, NULL);
+copy_len += nlb + 1;
+}
+
+if (copy_len > ns->id_ns.mcl) {
+return NVME_CMD_SIZE_LIMIT | NVME_DNR;
+}
+
+return NVME_SUCCESS;
+}
+
 static void nvme_copy_out_completed_cb(void *opaque, int ret)
 {
 NvmeCopyAIOCB *iocb = opaque;
@@ -3157,6 +3176,11 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
 }
 }
 
+status = nvme_check_copy_mcl(ns, iocb, nr);
+if (status) {
+goto invalid;
+}
+
 iocb->req = req;
 iocb->ret = 0;
 iocb->nr = nr;
-- 
2.34.1




[PATCH RESEND 0/2] hw/nvme: COPY fixes

2023-04-17 Thread Minwoo Im
Fix status code overwritten issue in the COPY command and a trivial
patch to check the Maximum Copy Length (MCL) for COPY command.

Minwoo Im (2):
  hw/nvme: consider COPY command in nvme_aio_err
  hw/nvme: check maximum copy length (MCL) for COPY

 hw/nvme/ctrl.c | 25 +
 1 file changed, 25 insertions(+)

-- 
2.34.1




[PATCH] hw/nvme: add comment for nvme-ns properties

2023-04-17 Thread Minwoo Im
From: Minwoo Im 

Add more comments of existing properties for nvme-ns device.

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8b7be14209..87c07f5dbb 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -43,7 +43,14 @@
  *  subsys=
  *  -device nvme-ns,drive=,bus=,nsid=,\
  *  zoned=, \
- *  subsys=,detached=
+ *  subsys=,shared=, \
+ *  detached=, \
+ *  zoned.zone_size=, \
+ *  zoned.zone_capacity=, \
+ *  zoned.descr_ext_size=, \
+ *  zoned.max_active=, \
+ *  zoned.max_open=, \
+ *  zoned.cross_read=
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
-- 
2.34.1




[PATCH 2/2] hw/nvme: check maximum copy length (MCL) for COPY

2023-03-24 Thread Minwoo Im
MCL(Maximum Copy Length) in the Identify Namespace data structure limits
the number of LBAs to be copied inside of the controller.  We've not
checked it at all, so added the check with returning the proper error
status.

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index fef5079a71..b490bf4d3e 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -2845,6 +2845,25 @@ static void nvme_copy_source_range_parse(void *ranges, 
int idx, uint8_t format,
 }
 }
 
+static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
+   NvmeCopyAIOCB *iocb, uint16_t nr)
+{
+uint32_t copy_len = 0;
+
+for (int idx = 0; idx < nr; idx++) {
+uint32_t nlb;
+nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
+ &nlb, NULL, NULL, NULL);
+copy_len += nlb + 1;
+}
+
+if (copy_len > ns->id_ns.mcl) {
+return NVME_CMD_SIZE_LIMIT | NVME_DNR;
+}
+
+return NVME_SUCCESS;
+}
+
 static void nvme_copy_out_completed_cb(void *opaque, int ret)
 {
 NvmeCopyAIOCB *iocb = opaque;
@@ -3157,6 +3176,11 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
 }
 }
 
+status = nvme_check_copy_mcl(ns, iocb, nr);
+if (status) {
+goto invalid;
+}
+
 iocb->req = req;
 iocb->ret = 0;
 iocb->nr = nr;
-- 
2.34.1



RE: [PATCH] vfio/pci: add support for VF token

2023-03-24 Thread Minwoo Im



> -Original Message-
> From: qemu-devel-bounces+minwoo.im=samsung@nongnu.org  bounces+minwoo.im=samsung@nongnu.org> On Behalf Of Alex Williamson
> Sent: Friday, March 24, 2023 3:46 AM
> To: Minwoo Im 
> Cc: Cédric Le Goater ; qemu-devel@nongnu.org; SSDR Gost Dev
> ; Klaus Birkelund Jensen 
> Subject: Re: [PATCH] vfio/pci: add support for VF token
> 
> On Thu, 23 Mar 2023 06:19:45 +0900
> Minwoo Im  wrote:
> 
> > > On Mon, 20 Mar 2023 11:03:40 +0100
> > > Cédric Le Goater  wrote:
> > >
> > > > On 3/20/23 08:35, Minwoo Im wrote:
> > > > > VF token was introduced [1] to kernel vfio-pci along with SR-IOV
> > > > > support [2].  This patch adds support VF token among PF and VF(s). To
> > > > > passthu PCIe VF to a VM, kernel >= v5.7 needs this.
> > > > >
> > > > > It can be configured with UUID like:
> > > > >
> > > > >-device vfio-pci,host=:BB:DD:F,vf-token=,...
> > > > >
> > > > > [1] https://lore.kernel.org/linux-
> > > pci/158396393244.5601.10297430724964025753.st...@gimli.home/
> > > > > [2] https://lore.kernel.org/linux-
> > > pci/158396044753.5601.14804870681174789709.st...@gimli.home/
> > > > >
> > > > > Cc: Alex Williamson 
> > > > > Signed-off-by: Minwoo Im 
> > > > > Reviewed-by: Klaus Jensen 
> > > > > ---
> > > > >   hw/vfio/pci.c | 13 -
> > > > >   hw/vfio/pci.h |  1 +
> > > > >   2 files changed, 13 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > > > index ec9a854361..cf27f28936 100644
> > > > > --- a/hw/vfio/pci.c
> > > > > +++ b/hw/vfio/pci.c
> > > > > @@ -2856,6 +2856,8 @@ static void vfio_realize(PCIDevice *pdev, Error
> **errp)
> > > > >   int groupid;
> > > > >   int i, ret;
> > > > >   bool is_mdev;
> > > > > +char uuid[UUID_FMT_LEN];
> > > > > +char *name;
> > > > >
> > > > >   if (!vbasedev->sysfsdev) {
> > > > >   if (!(~vdev->host.domain || ~vdev->host.bus ||
> > > > > @@ -2936,7 +2938,15 @@ static void vfio_realize(PCIDevice *pdev, Error
> **errp)
> > > > >   goto error;
> > > > >   }
> > > > >
> > > > > -ret = vfio_get_device(group, vbasedev->name, vbasedev, errp);
> > > > > +if (!qemu_uuid_is_null(&vdev->vf_token)) {
> > > > > +qemu_uuid_unparse(&vdev->vf_token, uuid);
> > > > > +name = g_strdup_printf("%s vf_token=%s", vbasedev->name, 
> > > > > uuid);
> > > > > +} else {
> > > > > +name = vbasedev->name;
> > > > > +}
> > > > > +
> > > > > +ret = vfio_get_device(group, name, vbasedev, errp);
> > > > > +g_free(name);
> > > > >   if (ret) {
> > > > >   vfio_put_group(group);
> > > > >   goto error;
> > > >
> > > > Shouldn't we set the VF token in the kernel also ? See this QEMU
> implementation
> > > >
> > > >https://lore.kernel.org/lkml/20200204161737.34696...@w520.home/
> > > >
> > > > May be I misunderstood.
> > > >
> > >
> > > I think you're referring to the part there that calls
> > > VFIO_DEVICE_FEATURE in order to set a VF token.  I don't think that's
> > > necessarily applicable here.  I believe this patch is only trying to
> > > make it so that QEMU can consume a VF associated with a PF owned by a
> > > userspace vfio driver, ie. not QEMU.
> >
> > Yes, that's what this patch exactly does.
> >
> > >
> > > Setting the VF token is only relevant to PFs, which would require
> > > significantly more SR-IOV infrastructure in QEMU than sketched out in
> > > that proof-of-concept patch.  Even if we did have a QEMU owned PF where
> > > we wanted to generate VFs, the token we use in that case would likely
> > > need to be kept private to QEMU, not passed on the command line.
> > > Thanks,
> >
> > Can we also take a command line property for the PF for that case that
> > QEMU owns a PF?  I think the one who wants to make QEMU owns PF or VF
> > should know the VF token.  If I've missed anything, please let me know.
> 
> IIRC, the only case where a VF token is required for a PF is if there
> are existing VFs in use.  Opening the PF would then require a token
> matching the VFs.  In general though, if the PF is owned by QEMU, the
> VF token should likely be an internal secret to QEMU.  Configuring the
> PF device with a token suggests that VFs could be created and bound to
> other userspace drivers outside of the control of the QEMU instance
> that owns the PF.  Therefore I would not suggest adding the ability to
> set the VF token for a PF without a strong use case in-hand, an
> certainly not when QEMU doesn't expose SR-IOV support to be able to
> manage VFs itself.  Thanks,
> 
> Alex
> 

Thanks for the explanation!



[PATCH 1/2] hw/nvme: consider COPY command in nvme_aio_err

2023-03-24 Thread Minwoo Im
If we don't have NVME_CMD_COPY consideration in the switch statement in
nvme_aio_err(), it will go to have NVME_INTERNAL_DEV_ERROR and
`req->status` will be overwritten to it.  During the aio context, it
might set the NVMe status field like NVME_CMD_SIZE_LIMIT, but it's
overwritten in the nvme_aio_err().

Add consideration for the NVME_CMD_COPY not to overwrite the status at
the end of the function.

Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 49c1210fce..fef5079a71 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1748,6 +1748,7 @@ static void nvme_aio_err(NvmeRequest *req, int ret)
 case NVME_CMD_WRITE:
 case NVME_CMD_WRITE_ZEROES:
 case NVME_CMD_ZONE_APPEND:
+case NVME_CMD_COPY:
 status = NVME_WRITE_FAULT;
 break;
 default:
-- 
2.34.1




RE: [PATCH] vfio/pci: add support for VF token

2023-03-22 Thread Minwoo Im
> On Mon, 20 Mar 2023 11:03:40 +0100
> Cédric Le Goater  wrote:
> 
> > On 3/20/23 08:35, Minwoo Im wrote:
> > > VF token was introduced [1] to kernel vfio-pci along with SR-IOV
> > > support [2].  This patch adds support VF token among PF and VF(s). To
> > > passthu PCIe VF to a VM, kernel >= v5.7 needs this.
> > >
> > > It can be configured with UUID like:
> > >
> > >-device vfio-pci,host=:BB:DD:F,vf-token=,...
> > >
> > > [1] https://lore.kernel.org/linux-
> pci/158396393244.5601.10297430724964025753.st...@gimli.home/
> > > [2] https://lore.kernel.org/linux-
> pci/158396044753.5601.14804870681174789709.st...@gimli.home/
> > >
> > > Cc: Alex Williamson 
> > > Signed-off-by: Minwoo Im 
> > > Reviewed-by: Klaus Jensen 
> > > ---
> > >   hw/vfio/pci.c | 13 -
> > >   hw/vfio/pci.h |  1 +
> > >   2 files changed, 13 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > index ec9a854361..cf27f28936 100644
> > > --- a/hw/vfio/pci.c
> > > +++ b/hw/vfio/pci.c
> > > @@ -2856,6 +2856,8 @@ static void vfio_realize(PCIDevice *pdev, Error 
> > > **errp)
> > >   int groupid;
> > >   int i, ret;
> > >   bool is_mdev;
> > > +char uuid[UUID_FMT_LEN];
> > > +char *name;
> > >
> > >   if (!vbasedev->sysfsdev) {
> > >   if (!(~vdev->host.domain || ~vdev->host.bus ||
> > > @@ -2936,7 +2938,15 @@ static void vfio_realize(PCIDevice *pdev, Error 
> > > **errp)
> > >   goto error;
> > >   }
> > >
> > > -ret = vfio_get_device(group, vbasedev->name, vbasedev, errp);
> > > +if (!qemu_uuid_is_null(&vdev->vf_token)) {
> > > +qemu_uuid_unparse(&vdev->vf_token, uuid);
> > > +name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid);
> > > +} else {
> > > +name = vbasedev->name;
> > > +}
> > > +
> > > +ret = vfio_get_device(group, name, vbasedev, errp);
> > > +g_free(name);
> > >   if (ret) {
> > >   vfio_put_group(group);
> > >   goto error;
> >
> > Shouldn't we set the VF token in the kernel also ? See this QEMU 
> > implementation
> >
> >https://lore.kernel.org/lkml/20200204161737.34696...@w520.home/
> >
> > May be I misunderstood.
> >
> 
> I think you're referring to the part there that calls
> VFIO_DEVICE_FEATURE in order to set a VF token.  I don't think that's
> necessarily applicable here.  I believe this patch is only trying to
> make it so that QEMU can consume a VF associated with a PF owned by a
> userspace vfio driver, ie. not QEMU.

Yes, that's what this patch exactly does.

> 
> Setting the VF token is only relevant to PFs, which would require
> significantly more SR-IOV infrastructure in QEMU than sketched out in
> that proof-of-concept patch.  Even if we did have a QEMU owned PF where
> we wanted to generate VFs, the token we use in that case would likely
> need to be kept private to QEMU, not passed on the command line.
> Thanks,

Can we also take a command line property for the PF for that case that
QEMU owns a PF?  I think the one who wants to make QEMU owns PF or VF
should know the VF token.  If I've missed anything, please let me know.

Thanks!



[PATCH] vfio/pci: add support for VF token

2023-03-20 Thread Minwoo Im
VF token was introduced [1] to kernel vfio-pci along with SR-IOV
support [2].  This patch adds support VF token among PF and VF(s). To
passthu PCIe VF to a VM, kernel >= v5.7 needs this.

It can be configured with UUID like:

  -device vfio-pci,host=DDDD:BB:DD.F,vf-token=<uuid>,...

[1] 
https://lore.kernel.org/linux-pci/158396393244.5601.10297430724964025753.st...@gimli.home/
[2] 
https://lore.kernel.org/linux-pci/158396044753.5601.14804870681174789709.st...@gimli.home/

Cc: Alex Williamson 
Signed-off-by: Minwoo Im 
Reviewed-by: Klaus Jensen 
---
 hw/vfio/pci.c | 13 -
 hw/vfio/pci.h |  1 +
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ec9a854361..cf27f28936 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2856,6 +2856,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 int groupid;
 int i, ret;
 bool is_mdev;
+char uuid[UUID_FMT_LEN];
+char *name;
 
 if (!vbasedev->sysfsdev) {
 if (!(~vdev->host.domain || ~vdev->host.bus ||
@@ -2936,7 +2938,15 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 goto error;
 }
 
-ret = vfio_get_device(group, vbasedev->name, vbasedev, errp);
+if (!qemu_uuid_is_null(&vdev->vf_token)) {
+qemu_uuid_unparse(&vdev->vf_token, uuid);
+name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid);
+} else {
+name = vbasedev->name;
+}
+
+ret = vfio_get_device(group, name, vbasedev, errp);
+g_free(name);
 if (ret) {
 vfio_put_group(group);
 goto error;
@@ -3268,6 +3278,7 @@ static void vfio_instance_init(Object *obj)
 
 static Property vfio_pci_dev_properties[] = {
 DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
+DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
 DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
 DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
 vbasedev.pre_copy_dirty_page_tracking,
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 177abcc8fb..2674476d6c 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -137,6 +137,7 @@ struct VFIOPCIDevice {
 VFIOVGA *vga; /* 0xa, 0x3b0, 0x3c0 */
 void *igd_opregion;
 PCIHostDeviceAddress host;
+QemuUUID vf_token;
 EventNotifier err_notifier;
 EventNotifier req_notifier;
 int (*resetfn)(struct VFIOPCIDevice *);
-- 
2.34.1




Re: [PATCH] hw/block/nvme: slba equal to nsze is out of bounds if nlb is 1-based

2021-04-09 Thread Minwoo Im
On 21-04-09 14:36:19, Klaus Jensen wrote:
> On Apr  9 21:31, Minwoo Im wrote:
> > On 21-04-09 13:55:01, Klaus Jensen wrote:
> > > On Apr  9 20:05, Minwoo Im wrote:
> > > > On 21-04-09 13:14:02, Gollu Appalanaidu wrote:
> > > > > NSZE is the total size of the namespace in logical blocks. So the max
> > > > > addressable logical block is NLB minus 1. So your starting logical
> > > > > block is equal to NSZE it is a out of range.
> > > > >
> > > > > Signed-off-by: Gollu Appalanaidu 
> > > > > ---
> > > > >  hw/block/nvme.c | 2 +-
> > > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > > > index 953ec64729..be9edb1158 100644
> > > > > --- a/hw/block/nvme.c
> > > > > +++ b/hw/block/nvme.c
> > > > > @@ -2527,7 +2527,7 @@ static uint16_t nvme_dsm(NvmeCtrl *n, 
> > > > > NvmeRequest *req)
> > > > >  uint64_t slba = le64_to_cpu(range[i].slba);
> > > > >  uint32_t nlb = le32_to_cpu(range[i].nlb);
> > > > >
> > > > > -if (nvme_check_bounds(ns, slba, nlb)) {
> > > > > +if (nvme_check_bounds(ns, slba, nlb) || slba == 
> > > > > ns->id_ns.nsze) {
> > > >
> > > > This patch also looks like check the boundary about slba.  Should it be
> > > > also checked inside of nvme_check_bounds() ?
> > > 
> > > The catch here is that DSM is like the only command where the number of
> > > logical blocks is a 1s-based value. Otherwise we always have nlb > 0, 
> > > which
> > > means that nvme_check_bounds() will always "do the right thing".
> > > 
> > > My main gripe here is that (in my mind), by definition, a "zero length
> > > range" does not reference any LBAs at all. So how can it result in LBA Out
> > > of Range?
> > 
> > Even if this is not the LBA out of range case which is currently what
> > nvme_check_bounds() checking, but I thought the function checks the
> > bounds so that we can add one more check inside of that function like:
> > (If SLBA is 0-based or not, slba should not be nsze, isn't it ?)
> > 
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 7244534a89e9..25a7db5ecbd8 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -1415,6 +1415,10 @@ static inline uint16_t 
> > nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
> > {
> > uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
> > 
> > +if (slba == nsze) {
> > +return NVME_INVALID_FIELD | NVME_DNR;
> > +}
> > +
> > if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
> > return NVME_LBA_RANGE | NVME_DNR;
> > }
> > 
> > Or am I missing something here ;) ?
> 
> No, not at all, it's just that this additional check is never needed for any
> other command than DSM since, as far as I remember, DSM is the only command
> with the 1s-based NLB value fuckup.
> 
> This means that nlb will always be at least 1, so slba + 1 > nsze will be
> false if slba == nsze.

Understood :)

Please have:

Reviewed-by: Minwoo Im 



Re: [PATCH] hw/block/nvme: slba equal to nsze is out of bounds if nlb is 1-based

2021-04-09 Thread Minwoo Im
On 21-04-09 13:55:01, Klaus Jensen wrote:
> On Apr  9 20:05, Minwoo Im wrote:
> > On 21-04-09 13:14:02, Gollu Appalanaidu wrote:
> > > NSZE is the total size of the namespace in logical blocks. So the max
> > > addressable logical block is NLB minus 1. So your starting logical
> > > block is equal to NSZE it is a out of range.
> > > 
> > > Signed-off-by: Gollu Appalanaidu 
> > > ---
> > >  hw/block/nvme.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 953ec64729..be9edb1158 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -2527,7 +2527,7 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest 
> > > *req)
> > >  uint64_t slba = le64_to_cpu(range[i].slba);
> > >  uint32_t nlb = le32_to_cpu(range[i].nlb);
> > > 
> > > -if (nvme_check_bounds(ns, slba, nlb)) {
> > > +if (nvme_check_bounds(ns, slba, nlb) || slba == 
> > > ns->id_ns.nsze) {
> > 
> > This patch also looks like check the boundary about slba.  Should it be
> > also checked inside of nvme_check_bounds() ?
> 
> The catch here is that DSM is like the only command where the number of
> logical blocks is a 1s-based value. Otherwise we always have nlb > 0, which
> means that nvme_check_bounds() will always "do the right thing".
> 
> My main gripe here is that (in my mind), by definition, a "zero length
> range" does not reference any LBAs at all. So how can it result in LBA Out
> of Range?

Even if this is not the LBA out of range case which is currently what
nvme_check_bounds() checking, but I thought the function checks the
bounds so that we can add one more check inside of that function like:
(If SLBA is 0-based or not, slba should not be nsze, isn't it ?)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 7244534a89e9..25a7db5ecbd8 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1415,6 +1415,10 @@ static inline uint16_t nvme_check_bounds(NvmeNamespace 
*ns, uint64_t slba,
 {
 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
 
+if (slba == nsze) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
 return NVME_LBA_RANGE | NVME_DNR;
 }

Or am I missing something here ;) ?



Re: [PATCH 2/2] hw/block/nvme: drain namespaces on sq deletion

2021-04-09 Thread Minwoo Im
On 21-04-08 21:37:09, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> For most commands, when issuing an AIO, the BlockAIOCB is stored in the
> NvmeRequest aiocb pointer when the AIO is issued. The main use of this
> is cancelling AIOs when deleting submission queues (it is currently not
> used for Abort).
> 
> However, some commands like Dataset Management Zone Management Send
> (zone reset) may involve more than one AIO and here the AIOs are issued
> without saving a reference to the BlockAIOCB. This is a problem since
> nvme_del_sq() will attempt to cancel outstanding AIOs, potentially with
> an invalid BlockAIOCB since the aiocb pointer is not NULL'ed when the
> request structure is recycled.
> 
> Fix this by
> 
>   1. making sure the aiocb pointer is NULL'ed when requests are recycled
>   2. only attempt to cancel the AIO if the aiocb is non-NULL
>   3. if any AIOs could not be cancelled, drain all aio as a last resort.
> 
> Fixes: dc04d25e2f3f ("hw/block/nvme: add support for the format nvm command")
> Fixes: c94973288cd9 ("hw/block/nvme: add broadcast nsid support flush 
> command")
> Fixes: e4e430b3d6ba ("hw/block/nvme: add simple copy command")
> Fixes: 5f5dc4c6a942 ("hw/block/nvme: zero out zones on reset")
> Fixes: 2605257a26b8 ("hw/block/nvme: add the dataset management command")
> Cc: Gollu Appalanaidu 
> Cc: Minwoo Im 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 23 +--
>  1 file changed, 21 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 94bc373260be..3c4297e38a52 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -470,6 +470,7 @@ static void nvme_req_clear(NvmeRequest *req)
>  {
>  req->ns = NULL;
>  req->opaque = NULL;
> +req->aiocb = NULL;
> +memset(&req->cqe, 0x0, sizeof(req->cqe));
>  req->status = NVME_SUCCESS;
>  }
> @@ -3681,6 +3682,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest 
> *req)
>  NvmeSQueue *sq;
>  NvmeCQueue *cq;
>  uint16_t qid = le16_to_cpu(c->qid);
> +int nsid;

Even though we don't have the fully supported number of namespaces in this
device (0xffffffff), can we have this one as `uint32_t` ?

Otherwise, looks good to me.

Reviewed-by: Minwoo Im 



Re: [PATCH 1/2] hw/block/nvme: store aiocb in compare

2021-04-09 Thread Minwoo Im
On 21-04-08 21:37:08, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> nvme_compare() fails to store the aiocb from the blk_aio_preadv() call.
> Fix this.
> 
> Fixes: 0a384f923f51 ("hw/block/nvme: add compare command")
> Cc: Gollu Appalanaidu 
> Signed-off-by: Klaus Jensen 

Reviewed-by: Minwoo Im 



Re: [PATCH] hw/block/nvme: slba equal to nsze is out of bounds if nlb is 1-based

2021-04-09 Thread Minwoo Im
On 21-04-09 13:14:02, Gollu Appalanaidu wrote:
> NSZE is the total size of the namespace in logical blocks. So the max
> addressable logical block is NSZE minus 1. So if your starting logical
> block is equal to NSZE, it is out of range.
> 
> Signed-off-by: Gollu Appalanaidu 
> ---
>  hw/block/nvme.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 953ec64729..be9edb1158 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -2527,7 +2527,7 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
>  uint64_t slba = le64_to_cpu(range[i].slba);
>  uint32_t nlb = le32_to_cpu(range[i].nlb);
>  
> -if (nvme_check_bounds(ns, slba, nlb)) {
> +if (nvme_check_bounds(ns, slba, nlb) || slba == ns->id_ns.nsze) {

This patch also looks like it checks the boundary of slba.  Should it
also be checked inside of nvme_check_bounds() ?



Re: [PATCH for-6.0 v2 7/8] hw/block/nvme: fix handling of private namespaces

2021-04-06 Thread Minwoo Im
On 21-04-05 19:54:51, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Prior to this patch, if a private nvme-ns device (that is, a namespace
> that is not linked to a subsystem) is wired up to an nvme-subsys linked
> nvme controller device, the device fails to verify that the namespace id
> is unique within the subsystem. NVM Express v1.4b, Section 6.1.6 ("NSID
> and Namespace Usage") states that because the device supports Namespace
> Management, "NSIDs *shall* be unique within the NVM subsystem".
> 
> Additionally, prior to this patch, private namespaces are not known to
> the subsystem and the namespace is considered exclusive to the
> controller with which it is initially wired up to. However, this is not
> the definition of a private namespace; per Section 1.6.33 ("private
> namespace"), a private namespace is just a namespace that does not
> support multipath I/O or namespace sharing, which means "that it is only
> able to be attached to one controller at a time".
> 
> Fix this by always allocating namespaces in the subsystem (if one is
> linked to the controller), regardsless of the shared/private status of
> the namespace. Whether or not the namespace is shareable is controlled
> by a new `shared` nvme-ns parameter.
> 
> Finally, this fix allows the nvme-ns `subsys` parameter to be removed,
> since the `shared` parameter now serves the purpose of attaching the
> namespace to all controllers in the subsystem upon device realization.
> It is invalid to have an nvme-ns namespace device with a linked
> subsystem without the parent nvme controller device also being linked to
> one and since the nvme-ns devices will unconditionally be "attached" (in
> QEMU terms that is) to an nvme controller device through an NvmeBus, the
> nvme-ns namespace device can always get a reference to the subsystem of
> the controller it is explicitly (using 'bus=' parametr) or implicitly
> attaching to.
> 
> Fixes: e570768566b3 ("hw/block/nvme: support for shared namespace in 
> subsystem")
> Cc: Minwoo Im 
> Signed-off-by: Klaus Jensen 
> Reviewed-by: Gollu Appalanaidu 

Reviewed-by: Minwoo Im 

Thanks for the fix.



Re: [PATCH v5 09/13] hw/block/nvme: parameterize nvme_ns_nlbas

2021-03-16 Thread Minwoo Im
On 21-03-16 08:19:08, Klaus Jensen wrote:
> On Mar 16 15:53, Minwoo Im wrote:
> > On 21-03-10 10:53:43, Klaus Jensen wrote:
> > > From: Klaus Jensen 
> > > 
> > > Provide a more flexible nlbas helper.
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/nvme-ns.h | 14 ++
> > >  1 file changed, 10 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> > > index 07e16880801d..34f9474a1cd1 100644
> > > --- a/hw/block/nvme-ns.h
> > > +++ b/hw/block/nvme-ns.h
> > > @@ -136,12 +136,18 @@ static inline bool nvme_ns_ext(NvmeNamespace *ns)
> > >  }
> > >  
> > >  /* calculate the number of LBAs that the namespace can accomodate */
> > > +static inline uint64_t __nvme_nlbas(size_t size, uint8_t lbads, uint16_t 
> > > ms)
> > > +{
> > > +if (ms) {
> > > +return size / ((1 << lbads) + ms);
> > > +}
> > > +
> > > +return size >> lbads;
> > > +}
> > > +
> > >  static inline uint64_t nvme_ns_nlbas(NvmeNamespace *ns)
> > >  {
> > > -if (nvme_msize(ns)) {
> > > -return ns->size / (nvme_lsize(ns) + nvme_msize(ns));
> > > -}
> > > -return ns->size >> nvme_ns_lbads(ns);
> > > +return __nvme_nlbas(ns->size, nvme_ns_lbads(ns), nvme_msize(ns));
> > >  }
> > 
> > Hmm.. I think it looks like __nvme_nlbas does the same with the
> > nvme_ns_nlbas, but flexible argument attributes.  But I think those
> > three attributes are all for ns-specific fields which is not that
> > generic so that I don't think we are going to take the helper from much
> > more general perspective with __nvme_nlbas.
> > 
> 
> This patch should be moved two patches forward in the series - it is
> used in [12/13] to check the zone geometry before the values are set on
> the namespace proper. This is also used in Format NVM to verify the
> format before formatting ("commiting" the values on the NvmeNamespace
> structure).

Checked [12/13] right before.  Thanks for pointing that out!

Reviewed-by: Minwoo Im 



Re: [PATCH v5 09/13] hw/block/nvme: parameterize nvme_ns_nlbas

2021-03-16 Thread Minwoo Im
On 21-03-10 10:53:43, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Provide a more flexible nlbas helper.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme-ns.h | 14 ++
>  1 file changed, 10 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> index 07e16880801d..34f9474a1cd1 100644
> --- a/hw/block/nvme-ns.h
> +++ b/hw/block/nvme-ns.h
> @@ -136,12 +136,18 @@ static inline bool nvme_ns_ext(NvmeNamespace *ns)
>  }
>  
>  /* calculate the number of LBAs that the namespace can accomodate */
> +static inline uint64_t __nvme_nlbas(size_t size, uint8_t lbads, uint16_t ms)
> +{
> +if (ms) {
> +return size / ((1 << lbads) + ms);
> +}
> +
> +return size >> lbads;
> +}
> +
>  static inline uint64_t nvme_ns_nlbas(NvmeNamespace *ns)
>  {
> -if (nvme_msize(ns)) {
> -return ns->size / (nvme_lsize(ns) + nvme_msize(ns));
> -}
> -return ns->size >> nvme_ns_lbads(ns);
> +return __nvme_nlbas(ns->size, nvme_ns_lbads(ns), nvme_msize(ns));
>  }

Hmm.. I think it looks like __nvme_nlbas does the same with the
nvme_ns_nlbas, but flexible argument attributes.  But I think those
three attributes are all for ns-specific fields which is not that
generic so that I don't think we are going to take the helper from much
more general perspective with __nvme_nlbas.



Re: [PATCH v5 08/13] hw/block/nvme: pull lba format initialization

2021-03-16 Thread Minwoo Im
Reviewed-by: Minwoo Im 



Re: [PATCH v5 07/13] hw/block/nvme: prefer runtime helpers instead of device parameters

2021-03-16 Thread Minwoo Im
Reviewed-by: Minwoo Im 



[PATCH V4 8/8] hw/block/nvme: support Identify NS Attached Controller List

2021-03-02 Thread Minwoo Im
Support Identify command for Namespace attached controller list.  This
command handler will traverse the controller instances in the given
subsystem to figure out whether the specified nsid is attached to the
controllers or not.

The 4096-byte Identify data will be returned with the first entry (16 bits)
indicating the number of the controller id entries.  So, the data can
hold up to 2047 entries for the controller ids.

Signed-off-by: Minwoo Im 
Tested-by: Klaus Jensen 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme.c   | 42 ++
 hw/block/trace-events |  1 +
 include/block/nvme.h  |  1 +
 3 files changed, 44 insertions(+)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 7e6f91923fd7..7b79b6b5265d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3303,6 +3303,46 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req, bool active)
 return NVME_INVALID_CMD_SET | NVME_DNR;
 }
 
+static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+uint16_t min_id = le16_to_cpu(c->ctrlid);
+uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
+uint16_t *ids = &list[1];
+NvmeNamespace *ns;
+NvmeCtrl *ctrl;
+int cntlid, nr_ids = 0;
+
+trace_pci_nvme_identify_ns_attached_list(min_id);
+
+if (c->nsid == NVME_NSID_BROADCAST) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+ns = nvme_subsys_ns(n->subsys, c->nsid);
+if (!ns) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
+ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
+if (!ctrl) {
+continue;
+}
+
+if (!nvme_ns_is_attached(ctrl, ns)) {
+continue;
+}
+
+ids[nr_ids++] = cntlid;
+}
+
+list[0] = nr_ids;
+
+return nvme_dma(n, (uint8_t *)list, sizeof(list),
+DMA_DIRECTION_FROM_DEVICE, req);
+}
+
 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
 bool active)
 {
@@ -3502,6 +3542,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_identify_ns(n, req, true);
 case NVME_ID_CNS_NS_PRESENT:
 return nvme_identify_ns(n, req, false);
+case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
+return nvme_identify_ns_attached_list(n, req);
 case NVME_ID_CNS_CS_NS:
 return nvme_identify_ns_csi(n, req, true);
 case NVME_ID_CNS_CS_NS_PRESENT:
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 98d542c999e2..2628d69c7879 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -64,6 +64,7 @@ pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, 
cqid=%"PRIu16""
 pci_nvme_identify_ctrl(void) "identify controller"
 pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8""
 pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32""
+pci_nvme_identify_ns_attached_list(uint16_t cntid) "cntid=%"PRIu16""
 pci_nvme_identify_ns_csi(uint32_t ns, uint8_t csi) "nsid=%"PRIu32", 
csi=0x%"PRIx8""
 pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32""
 pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", 
csi=0x%"PRIx8""
diff --git a/include/block/nvme.h b/include/block/nvme.h
index eb0b31e949c2..b18945913927 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -971,6 +971,7 @@ enum NvmeIdCns {
 NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
 NVME_ID_CNS_NS_PRESENT_LIST   = 0x10,
 NVME_ID_CNS_NS_PRESENT= 0x11,
+NVME_ID_CNS_NS_ATTACHED_CTRL_LIST = 0x12,
 NVME_ID_CNS_CS_NS_PRESENT_LIST= 0x1a,
 NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
 NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
-- 
2.27.0




[PATCH V4 5/8] hw/block/nvme: refactor nvme_select_ns_iocs

2021-03-02 Thread Minwoo Im
This patch has no functional changes.  This patch just refactored
nvme_select_ns_iocs() to iterate the attached namespaces of the
controller and make it invoke __nvme_select_ns_iocs().

Signed-off-by: Minwoo Im 
Tested-by: Klaus Jensen 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme.c | 36 +---
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 53c4d59e09a7..b18ab0ef810f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4000,6 +4000,25 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n)
 }
 }
 
+static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns)
+{
+ns->iocs = nvme_cse_iocs_none;
+switch (ns->csi) {
+case NVME_CSI_NVM:
+if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
+ns->iocs = nvme_cse_iocs_nvm;
+}
+break;
+case NVME_CSI_ZONED:
+if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
+ns->iocs = nvme_cse_iocs_zoned;
+} else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
+ns->iocs = nvme_cse_iocs_nvm;
+}
+break;
+}
+}
+
 static void nvme_select_ns_iocs(NvmeCtrl *n)
 {
 NvmeNamespace *ns;
@@ -4010,21 +4029,8 @@ static void nvme_select_ns_iocs(NvmeCtrl *n)
 if (!ns) {
 continue;
 }
-ns->iocs = nvme_cse_iocs_none;
-switch (ns->csi) {
-case NVME_CSI_NVM:
-if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
-ns->iocs = nvme_cse_iocs_nvm;
-}
-break;
-case NVME_CSI_ZONED:
-if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
-ns->iocs = nvme_cse_iocs_zoned;
-} else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
-ns->iocs = nvme_cse_iocs_nvm;
-}
-break;
-}
+
+__nvme_select_ns_iocs(n, ns);
 }
 }
 
-- 
2.27.0




[PATCH V4 7/8] hw/block/nvme: support changed namespace asynchronous event

2021-03-02 Thread Minwoo Im
If the namespace inventory is changed for some reason (e.g., namespace
attachment/detachment), controller can send out event notifier to the
host to manage namespaces.

This patch sends out the AEN to the host after either attach or detach
namespaces from controllers.  To support clear of the event from the
controller, this patch also implemented Get Log Page command for Changed
Namespace List log type.  To return namespace id list through the
command, when namespace inventory is updated, id is added to the
per-controller list (changed_ns_list).

To indicate the support of this async event, this patch set
OAES(Optional Asynchronous Events Supported) in Identify Controller data
structure.

Signed-off-by: Minwoo Im 
---
 hw/block/nvme-ns.h   |  1 +
 hw/block/nvme.c  | 57 
 hw/block/nvme.h  |  4 
 include/block/nvme.h |  7 ++
 4 files changed, 69 insertions(+)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index b0c00e115d81..318d3aebe1a8 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -53,6 +53,7 @@ typedef struct NvmeNamespace {
 uint8_t  csi;
 
 NvmeSubsystem   *subsys;
+QTAILQ_ENTRY(NvmeNamespace) entry;
 
 NvmeIdNsZoned   *id_ns_zoned;
 NvmeZone*zone_array;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 68c2e63d9412..7e6f91923fd7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2980,6 +2980,49 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len,
 DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
+uint64_t off, NvmeRequest *req)
+{
+uint32_t nslist[1024];
+uint32_t trans_len;
+int i = 0;
+uint32_t nsid;
+
+memset(nslist, 0x0, sizeof(nslist));
+trans_len = MIN(sizeof(nslist) - off, buf_len);
+
+while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
+NVME_CHANGED_NSID_SIZE) {
+/*
+ * If more than 1024 namespaces, the first entry in the log page should
+ * be set to 0xffffffff and the others to 0 as per the spec.
+ */
+if (i == ARRAY_SIZE(nslist)) {
+memset(nslist, 0x0, sizeof(nslist));
+nslist[0] = 0xffffffff;
+break;
+}
+
+nslist[i++] = nsid;
+clear_bit(nsid, n->changed_nsids);
+}
+
+/*
+ * Remove all the remaining list entries in case returns directly due to
+ * more than 1024 namespaces.
+ */
+if (nslist[0] == 0xffffffff) {
+bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
+}
+
+if (!rae) {
+nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
+}
+
+return nvme_dma(n, ((uint8_t *)nslist) + off, trans_len,
+DMA_DIRECTION_FROM_DEVICE, req);
+}
+
 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
  uint64_t off, NvmeRequest *req)
 {
@@ -3064,6 +3107,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_smart_info(n, rae, len, off, req);
 case NVME_LOG_FW_SLOT_INFO:
 return nvme_fw_log_info(n, len, off, req);
+case NVME_LOG_CHANGED_NSLIST:
+return nvme_changed_nslist(n, rae, len, off, req);
 case NVME_LOG_CMD_EFFECTS:
 return nvme_cmd_effects(n, csi, len, off, req);
 default:
@@ -3920,6 +3965,16 @@ static uint16_t nvme_ns_attachment(NvmeCtrl *n, 
NvmeRequest *req)
 
 nvme_ns_detach(ctrl, ns);
 }
+
+/*
+ * Add namespace id to the changed namespace id list for event clearing
+ * via Get Log Page command.
+ */
+if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
+nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
+   NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
+   NVME_LOG_CHANGED_NSLIST);
+}
 }
 
 return NVME_SUCCESS;
@@ -4910,6 +4965,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 
 id->cntlid = cpu_to_le16(n->cntlid);
 
+id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
+
 id->rab = 6;
 
 if (n->params.use_intel_id) {
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 74a00ab21a55..7245c2b638d9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -177,6 +177,10 @@ typedef struct NvmeCtrl {
 QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue;
 int aer_queued;
 
+/* Namespace ID is started with 1 so bitmap should be 1-based */
+#define NVME_CHANGED_NSID_SIZE  (NVME_MAX_NAMESPACES + 1)
+DECLARE_BITMAP(changed_nsids, NVME_CHANGED_NSID_SIZE);
+
 NvmeSubsystem   *subsys;
 
 NvmeNamespace   namespace;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 339784d9c23a..eb0b31e949c2 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -760,6 +760,7 @@

[PATCH V4 4/8] hw/block/nvme: support allocated namespace type

2021-03-02 Thread Minwoo Im
From NVMe spec 1.4b "6.1.5. NSID and Namespace Relationships" defines
valid namespace types:

- Unallocated: Not exists in the NVMe subsystem
- Allocated: Exists in the NVMe subsystem
- Inactive: Not attached to the controller
- Active: Attached to the controller

This patch added support for allocated, but not attached namespace type:

!nvme_ns(n, nsid) && nvme_subsys_ns(n->subsys, nsid)

nvme_ns() returns attached namespace instance of the given controller
and nvme_subsys_ns() returns allocated namespace instance in the
subsystem.

Signed-off-by: Minwoo Im 
Tested-by: Klaus Jensen 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme-subsys.h | 13 +
 hw/block/nvme.c| 63 +++---
 2 files changed, 60 insertions(+), 16 deletions(-)

diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index 8a0732b22316..14627f9ccb41 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -30,4 +30,17 @@ typedef struct NvmeSubsystem {
 int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp);
 int nvme_subsys_register_ns(NvmeNamespace *ns, Error **errp);
 
+/*
+ * Return allocated namespace of the specified nsid in the subsystem.
+ */
+static inline NvmeNamespace *nvme_subsys_ns(NvmeSubsystem *subsys,
+uint32_t nsid)
+{
+if (!subsys) {
+return NULL;
+}
+
+return subsys->namespaces[nsid];
+}
+
 #endif /* NVME_SUBSYS_H */
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f6aeae081840..53c4d59e09a7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3225,7 +3225,7 @@ static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
@@ -3239,7 +3239,14 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req)
 
 ns = nvme_ns(n, nsid);
 if (unlikely(!ns)) {
-return nvme_rpt_empty_id_struct(n, req);
+if (!active) {
+ns = nvme_subsys_ns(n->subsys, nsid);
+if (!ns) {
+return nvme_rpt_empty_id_struct(n, req);
+}
+} else {
+return nvme_rpt_empty_id_struct(n, req);
+}
 }
 
 if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
@@ -3250,7 +3257,8 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest 
*req)
 return NVME_INVALID_CMD_SET | NVME_DNR;
 }
 
-static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
+bool active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
@@ -3264,7 +3272,14 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, 
NvmeRequest *req)
 
 ns = nvme_ns(n, nsid);
 if (unlikely(!ns)) {
-return nvme_rpt_empty_id_struct(n, req);
+if (!active) {
+ns = nvme_subsys_ns(n->subsys, nsid);
+if (!ns) {
+return nvme_rpt_empty_id_struct(n, req);
+}
+} else {
+return nvme_rpt_empty_id_struct(n, req);
+}
 }
 
 if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
@@ -3277,7 +3292,8 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
+bool active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
@@ -3302,7 +3318,14 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 for (i = 1; i <= n->num_namespaces; i++) {
 ns = nvme_ns(n, i);
 if (!ns) {
-continue;
+if (!active) {
+ns = nvme_subsys_ns(n->subsys, i);
+if (!ns) {
+continue;
+}
+} else {
+continue;
+}
 }
 if (ns->params.nsid <= min_nsid) {
 continue;
@@ -3316,7 +3339,8 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
 }
 
-static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
+bool active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
@@ -3342,7 +3366,14 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, 
NvmeRequest *req)
 for (i = 1; i <= n->num_namespaces; i++) {
 ns = nvme_ns(n, i);
 if (!ns) {
-continue;
+if 

[PATCH V4 1/8] hw/block/nvme: support namespace detach

2021-03-02 Thread Minwoo Im
Given that now we have nvme-subsys device supported, we can manage
namespace allocated, but not attached: detached.  This patch introduced
a parameter for nvme-ns device named 'detached'.  This parameter
indicates whether the given namespace device is detached from
an entire NVMe subsystem ('subsys' given case, shared namespace) or a
controller('bus' given case, private namespace).

- Allocated namespace

  1) Shared ns in the subsystem 'subsys0':

 -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,subsys=subsys0,detached=true

  2) Private ns for the controller 'nvme0' of the subsystem 'subsys0':

 -device nvme-subsys,id=subsys0
 -device nvme,serial=foo,id=nvme0,subsys=subsys0
 -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,bus=nvme0,detached=true

  3) (Invalid case) Controller 'nvme0' has no subsystem to manage ns:

 -device nvme,serial=foo,id=nvme0
 -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,bus=nvme0,detached=true

Signed-off-by: Minwoo Im 
Tested-by: Klaus Jensen 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme-ns.c |  1 +
 hw/block/nvme-ns.h |  1 +
 hw/block/nvme-subsys.h |  1 +
 hw/block/nvme.c| 41 +++--
 hw/block/nvme.h| 22 ++
 5 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 0e8760020483..eda6a0c003a4 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -399,6 +399,7 @@ static Property nvme_ns_props[] = {
 DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
 DEFINE_PROP_LINK("subsys", NvmeNamespace, subsys, TYPE_NVME_SUBSYS,
  NvmeSubsystem *),
+DEFINE_PROP_BOOL("detached", NvmeNamespace, params.detached, false),
 DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
 DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
 DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128),
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 7af6884862b5..b0c00e115d81 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -26,6 +26,7 @@ typedef struct NvmeZone {
 } NvmeZone;
 
 typedef struct NvmeNamespaceParams {
+bool detached;
 uint32_t nsid;
 QemuUUID uuid;
 
diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index ccf6a71398d3..890d118117dc 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -23,6 +23,7 @@ typedef struct NvmeSubsystem {
 uint8_t subnqn[256];
 
 NvmeCtrl*ctrls[NVME_SUBSYS_MAX_CTRLS];
+/* Allocated namespaces for this subsystem */
 NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES];
 } NvmeSubsystem;
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index edd0b85c10ce..f6aeae081840 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -23,7 +23,7 @@
  *  max_ioqpairs=, \
  *  aerl=, aer_max_queued=, \
  *  mdts=,zoned.append_size_limit=, \
- *  subsys= \
+ *  subsys=,detached=
  *  -device nvme-ns,drive=,bus=,nsid=,\
  *  zoned=, \
  *  subsys=
@@ -82,6 +82,13 @@
  *   controllers in the subsystem. Otherwise, `bus` must be given to attach
  *   this namespace to a specified single controller as a non-shared namespace.
  *
+ * - `detached`
+ *   Not to attach the namespace device to controllers in the NVMe subsystem
+ *   during boot-up. If not given, namespaces are all attached to all
+ *   controllers in the subsystem by default.
+ *   It's mutually exclusive with the 'bus' parameter. It's only valid in case
+ *   `subsys` is provided.
+ *
  * Setting `zoned` to true selects Zoned Command Set at the namespace.
  * In this case, the following namespace properties are available to configure
  * zoned operation:
@@ -4613,6 +4620,20 @@ static void nvme_init_state(NvmeCtrl *n)
 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
 }
 
+static int nvme_attach_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+{
+if (nvme_ns_is_attached(n, ns)) {
+error_setg(errp,
+   "namespace %d is already attached to controller %d",
+   nvme_nsid(ns), n->cntlid);
+return -1;
+}
+
+nvme_ns_attach(n, ns);
+
+return 0;
+}
+
 int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
 {
 uint32_t nsid = nvme_nsid(ns);
@@ -4644,7 +4665,23 @@ int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace 
*ns, Error **errp)
 
 trace_pci_nvme_register_namespace(nsid);
 
-n->namespaces[nsid - 1] = ns;
+/*
+ * If subsys is not given, namespace is always attached to the controller
+ * because there's no subsystem to manage namespace allocation.
+ */
+if (!n->subsys) {
+if (ns->params.detached) {
+error_setg(errp,
+   "detached needs nvme-subsys specified nvme or nvme-ns");
+return -1;
+  

[PATCH V4 6/8] hw/block/nvme: support namespace attachment command

2021-03-02 Thread Minwoo Im
This patch supports Namespace Attachment command for the pre-defined
nvme-ns device nodes.  Of course, attach/detach namespace should only be
supported in case 'subsys' is given.  This is because if we detach a
namespace from a controller, somebody needs to manage the detached, but
allocated namespace in the NVMe subsystem.

As command effect for the namespace attachment command is registered,
the host will be notified that namespace inventory is changed so that
host will rescan the namespace inventory after this command.  For
example, kernel driver manages this command effect via passthru IOCTL.

Signed-off-by: Minwoo Im 
---
 hw/block/nvme-subsys.h | 10 +++
 hw/block/nvme.c| 61 +-
 hw/block/nvme.h|  5 
 hw/block/trace-events  |  2 ++
 include/block/nvme.h   |  6 +
 5 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index 14627f9ccb41..ef4bec928eae 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -30,6 +30,16 @@ typedef struct NvmeSubsystem {
 int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp);
 int nvme_subsys_register_ns(NvmeNamespace *ns, Error **errp);
 
+static inline NvmeCtrl *nvme_subsys_ctrl(NvmeSubsystem *subsys,
+uint32_t cntlid)
+{
+if (!subsys) {
+return NULL;
+}
+
+return subsys->ctrls[cntlid];
+}
+
 /*
  * Return allocated namespace of the specified nsid in the subsystem.
  */
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b18ab0ef810f..68c2e63d9412 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -187,6 +187,7 @@ static const uint32_t nvme_cse_acs[256] = {
 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_NS_ATTACHMENT]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
 };
 
 static const uint32_t nvme_cse_iocs_none[256];
@@ -3868,6 +3869,62 @@ static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
+static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns);
+static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeNamespace *ns;
+NvmeCtrl *ctrl;
+uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
+uint32_t nsid = le32_to_cpu(req->cmd.nsid);
+uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
+bool attach = !(dw10 & 0xf);
+uint16_t *nr_ids = [0];
+uint16_t *ids = [1];
+uint16_t ret;
+int i;
+
+trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
+
+ns = nvme_subsys_ns(n->subsys, nsid);
+if (!ns) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+ret = nvme_dma(n, (uint8_t *)list, 4096,
+   DMA_DIRECTION_TO_DEVICE, req);
+if (ret) {
+return ret;
+}
+
+if (!*nr_ids) {
+return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
+}
+
+for (i = 0; i < *nr_ids; i++) {
+ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
+if (!ctrl) {
+return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
+}
+
+if (attach) {
+if (nvme_ns_is_attached(ctrl, ns)) {
+return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
+}
+
+nvme_ns_attach(ctrl, ns);
+__nvme_select_ns_iocs(ctrl, ns);
+} else {
+if (!nvme_ns_is_attached(ctrl, ns)) {
+return NVME_NS_NOT_ATTACHED | NVME_DNR;
+}
+
+nvme_ns_detach(ctrl, ns);
+}
+}
+
+return NVME_SUCCESS;
+}
+
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
@@ -3899,6 +3956,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_get_feature(n, req);
 case NVME_ADM_CMD_ASYNC_EV_REQ:
 return nvme_aer(n, req);
+case NVME_ADM_CMD_NS_ATTACHMENT:
+return nvme_ns_attachment(n, req);
 default:
 assert(false);
 }
@@ -4865,7 +4924,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 
 id->mdts = n->params.mdts;
 id->ver = cpu_to_le32(NVME_SPEC_VER);
-id->oacs = cpu_to_le16(0);
+id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT);
 id->cntrltype = 0x1;
 
 /*
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 7599d6b1a41b..74a00ab21a55 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -220,6 +220,11 @@ static inline void nvme_ns_attach(NvmeCtrl *n, 
NvmeNamespace *ns)
 n->namespaces[nvme_nsid(ns) - 1] = ns;
 }
 
+static inline void nvme_ns_detach(NvmeCtrl *n, NvmeNamespace *ns)
+{
+n->namespaces[nvme_nsid(ns) - 1] = NULL;
+}
+
 static inline NvmeCQueue *nvme_cq(NvmeRequest *req)
 {
 NvmeSQueue *sq = req->sq;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 25ba51ea5405..98d542c999e2

[PATCH V4 3/8] hw/block/nvme: fix allocated namespace list to 256

2021-03-02 Thread Minwoo Im
Expand allocated namespace list (subsys->namespaces) to have 256 entries
which is a value larger than at least NVME_MAX_NAMESPACES which is for
attached namespace list in a controller.

Allocated namespace list should be at least as large as the attached
namespace list.

n->num_namespaces = NVME_MAX_NAMESPACES;

The above line will set the NN field by id->nn so that the subsystem
should also prepare at least this number of namespace list entries.

Signed-off-by: Minwoo Im 
Tested-by: Klaus Jensen 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme-subsys.h | 2 +-
 hw/block/nvme.h| 6 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index 574774390c4c..8a0732b22316 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -14,7 +14,7 @@
 OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
 
 #define NVME_SUBSYS_MAX_CTRLS   32
-#define NVME_SUBSYS_MAX_NAMESPACES  32
+#define NVME_SUBSYS_MAX_NAMESPACES  256
 
 typedef struct NvmeCtrl NvmeCtrl;
 typedef struct NvmeNamespace NvmeNamespace;
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 51b8739b4d1e..7599d6b1a41b 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -10,6 +10,12 @@
 #define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
 
+/*
+ * Subsystem namespace list for allocated namespaces should be larger than
+ * attached namespace list in a controller.
+ */
+QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_SUBSYS_MAX_NAMESPACES);
+
 typedef struct NvmeParams {
 char *serial;
 uint32_t num_queues; /* deprecated since 5.1 */
-- 
2.27.0




[PATCH V4 0/8] hw/block/nvme: support namespace attachment

2021-03-02 Thread Minwoo Im
Hello,

This series supports namespace attachment: attach and detach.  This is
the fourth version of series with replacing changed namespace list to
bitmap to indicate changed namespace IDs.

Please review.

Thanks,

Since V3:
  - Replace changed ns list to bitmap to not only represent relationship
between controller and namespace, but also avoid duplications of
nsids in the list.  (Klaus)

Since V2:
  - Added command effects (namespace inventory changed) for namespace
attach command.  (Keith)
  - Added [7/8] patch to support asynchronous event when namespace
inventory is updated.  (Keith)
  - Added review and tested tag from Klaus to all the patches, but [6/8]
and [7/8].

Since V1:
  - Fix to take 'ctrl' which is given from the command rather than 'n'.
(Klaus)
  - Add a [7/7] patch to support CNS 12h Identify command (Namespace
Attached Controller list).

Minwoo Im (8):
  hw/block/nvme: support namespace detach
  hw/block/nvme: fix namespaces array to 1-based
  hw/block/nvme: fix allocated namespace list to 256
  hw/block/nvme: support allocated namespace type
  hw/block/nvme: refactor nvme_select_ns_iocs
  hw/block/nvme: support namespace attachment command
  hw/block/nvme: support changed namespace asynchronous event
  hw/block/nvme: support Identify NS Attached Controller List

 hw/block/nvme-ns.c |   1 +
 hw/block/nvme-ns.h |   2 +
 hw/block/nvme-subsys.h |  28 +++-
 hw/block/nvme.c| 300 -
 hw/block/nvme.h|  37 +
 hw/block/trace-events  |   3 +
 include/block/nvme.h   |  14 ++
 7 files changed, 349 insertions(+), 36 deletions(-)

-- 
2.27.0




[PATCH V4 2/8] hw/block/nvme: fix namespaces array to 1-based

2021-03-02 Thread Minwoo Im
subsys->namespaces array used to be sized to NVME_SUBSYS_MAX_NAMESPACES.
But subsys->namespaces are being accessed with 1-based namespace id
which means the very first array entry will always be empty(NULL).

Signed-off-by: Minwoo Im 
Tested-by: Klaus Jensen 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme-subsys.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index 890d118117dc..574774390c4c 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -24,7 +24,7 @@ typedef struct NvmeSubsystem {
 
 NvmeCtrl*ctrls[NVME_SUBSYS_MAX_CTRLS];
 /* Allocated namespaces for this subsystem */
-NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES];
+NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES + 1];
 } NvmeSubsystem;
 
 int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp);
-- 
2.27.0




Re: [PATCH V3 7/8] hw/block/nvme: support changed namespace asynchronous event

2021-03-02 Thread Minwoo Im
On 21-03-01 06:56:02, Klaus Jensen wrote:
> On Mar  1 01:10, Minwoo Im wrote:
> > If namespace inventory is changed due to some reasons (e.g., namespace
> > attachment/detachment), controller can send out event notifier to the
> > host to manage namespaces.
> > 
> > This patch sends out the AEN to the host after either attach or detach
> > namespaces from controllers.  To support clear of the event from the
> > controller, this patch also implemented Get Log Page command for Changed
> > Namespace List log type.  To return namespace id list through the
> > command, when namespace inventory is updated, id is added to the
> > per-controller list (changed_ns_list).
> > 
> > To indicate the support of this async event, this patch set
> > OAES(Optional Asynchronous Events Supported) in Identify Controller data
> > structure.
> > 
> > Signed-off-by: Minwoo Im 
> > ---
> >  hw/block/nvme.c  | 44 
> >  hw/block/nvme.h  |  7 +++
> >  include/block/nvme.h |  7 +++
> >  3 files changed, 58 insertions(+)
> > 
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 68c2e63d9412..fc06f806e58e 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -2980,6 +2980,32 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t 
> > rae, uint32_t buf_len,
> >  DMA_DIRECTION_FROM_DEVICE, req);
> >  }
> >  
> > +static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t 
> > buf_len,
> > +uint64_t off, NvmeRequest *req)
> > +{
> > +uint32_t nslist[1024];
> > +uint32_t trans_len;
> > +NvmeChangedNs *ns, *next;
> > +int i = 0;
> > +
> > +memset(nslist, 0x0, sizeof(nslist));
> > +trans_len = MIN(sizeof(nslist) - off, buf_len);
> > +
> > +QTAILQ_FOREACH_SAFE(ns, >changed_ns_list, entry, next) {
> > +nslist[i++] = ns->nsid;
> > +
> > +QTAILQ_REMOVE(>changed_ns_list, ns, entry);
> > +g_free(ns);
> > +}
> > +
> > +if (!rae) {
> > +nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
> > +}
> > +
> > +return nvme_dma(n, ((uint8_t *)nslist) + off, trans_len,
> > +DMA_DIRECTION_FROM_DEVICE, req);
> > +}
> > +
> >  static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t 
> > buf_len,
> >   uint64_t off, NvmeRequest *req)
> >  {
> > @@ -3064,6 +3090,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest 
> > *req)
> >  return nvme_smart_info(n, rae, len, off, req);
> >  case NVME_LOG_FW_SLOT_INFO:
> >  return nvme_fw_log_info(n, len, off, req);
> > +case NVME_LOG_CHANGED_NSLIST:
> > +return nvme_changed_nslist(n, rae, len, off, req);
> >  case NVME_LOG_CMD_EFFECTS:
> >  return nvme_cmd_effects(n, csi, len, off, req);
> >  default:
> > @@ -3882,6 +3910,7 @@ static uint16_t nvme_ns_attachment(NvmeCtrl *n, 
> > NvmeRequest *req)
> >  uint16_t *ids = [1];
> >  uint16_t ret;
> >  int i;
> > +NvmeChangedNs *changed_nsid;
> >  
> >  trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
> >  
> > @@ -3920,6 +3949,18 @@ static uint16_t nvme_ns_attachment(NvmeCtrl *n, 
> > NvmeRequest *req)
> >  
> >  nvme_ns_detach(ctrl, ns);
> >  }
> > +
> > +/*
> > + * Add namespace id to the changed namespace id list for event 
> > clearing
> > + * via Get Log Page command.
> > + */
> > +changed_nsid = g_new(NvmeChangedNs, 1);
> > +changed_nsid->nsid = nsid;
> > +QTAILQ_INSERT_TAIL(>changed_ns_list, changed_nsid, entry);
> > +
> > +nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
> > +   NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
> > +   NVME_LOG_CHANGED_NSLIST);
> >  }
> 
> If one just keeps attaching/detaching we end up with more than 1024
> entries here and go out of bounds in nvme_changed_nslist.
> 
> How about having the QTAILQ_ENTRY directly on the NvmeNamespace struct
> and use QTAILQ_IN_USE to check if the namespace is already in the list?

QTAILQ_IN_USE might be tough to represent relationship between
controller and namespace itself.  So, I will work on this with standard
bitmap rather than the list.  I think bitmap will be easier to represent
the relationship.



[PATCH V3 7/8] hw/block/nvme: support changed namespace asynchronous event

2021-02-28 Thread Minwoo Im
If namespace inventory is changed due to some reasons (e.g., namespace
attachment/detachment), controller can send out event notifier to the
host to manage namespaces.

This patch sends out the AEN to the host after either attach or detach
namespaces from controllers.  To support clear of the event from the
controller, this patch also implemented Get Log Page command for Changed
Namespace List log type.  To return namespace id list through the
command, when namespace inventory is updated, id is added to the
per-controller list (changed_ns_list).

To indicate the support of this async event, this patch set
OAES(Optional Asynchronous Events Supported) in Identify Controller data
structure.

Signed-off-by: Minwoo Im 
---
 hw/block/nvme.c  | 44 
 hw/block/nvme.h  |  7 +++
 include/block/nvme.h |  7 +++
 3 files changed, 58 insertions(+)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 68c2e63d9412..fc06f806e58e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2980,6 +2980,32 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len,
 DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
+uint64_t off, NvmeRequest *req)
+{
+uint32_t nslist[1024];
+uint32_t trans_len;
+NvmeChangedNs *ns, *next;
+int i = 0;
+
+memset(nslist, 0x0, sizeof(nslist));
+trans_len = MIN(sizeof(nslist) - off, buf_len);
+
+QTAILQ_FOREACH_SAFE(ns, >changed_ns_list, entry, next) {
+nslist[i++] = ns->nsid;
+
+QTAILQ_REMOVE(>changed_ns_list, ns, entry);
+g_free(ns);
+}
+
+if (!rae) {
+nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
+}
+
+return nvme_dma(n, ((uint8_t *)nslist) + off, trans_len,
+DMA_DIRECTION_FROM_DEVICE, req);
+}
+
 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
  uint64_t off, NvmeRequest *req)
 {
@@ -3064,6 +3090,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_smart_info(n, rae, len, off, req);
 case NVME_LOG_FW_SLOT_INFO:
 return nvme_fw_log_info(n, len, off, req);
+case NVME_LOG_CHANGED_NSLIST:
+return nvme_changed_nslist(n, rae, len, off, req);
 case NVME_LOG_CMD_EFFECTS:
 return nvme_cmd_effects(n, csi, len, off, req);
 default:
@@ -3882,6 +3910,7 @@ static uint16_t nvme_ns_attachment(NvmeCtrl *n, 
NvmeRequest *req)
 uint16_t *ids = [1];
 uint16_t ret;
 int i;
+NvmeChangedNs *changed_nsid;
 
 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
 
@@ -3920,6 +3949,18 @@ static uint16_t nvme_ns_attachment(NvmeCtrl *n, 
NvmeRequest *req)
 
 nvme_ns_detach(ctrl, ns);
 }
+
+/*
+ * Add namespace id to the changed namespace id list for event clearing
+ * via Get Log Page command.
+ */
+changed_nsid = g_new(NvmeChangedNs, 1);
+changed_nsid->nsid = nsid;
+QTAILQ_INSERT_TAIL(>changed_ns_list, changed_nsid, entry);
+
+nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
+   NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
+   NVME_LOG_CHANGED_NSLIST);
 }
 
 return NVME_SUCCESS;
@@ -4714,6 +4755,7 @@ static void nvme_init_state(NvmeCtrl *n)
 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
+QTAILQ_INIT(>changed_ns_list);
 }
 
 static int nvme_attach_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
@@ -4910,6 +4952,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 
 id->cntlid = cpu_to_le16(n->cntlid);
 
+id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
+
 id->rab = 6;
 
 if (n->params.use_intel_id) {
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 74a00ab21a55..d5eaea003ea5 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -132,6 +132,11 @@ typedef struct NvmeFeatureVal {
 uint32_tasync_config;
 } NvmeFeatureVal;
 
+typedef struct NvmeChangedNs {
+uint32_t nsid;
+QTAILQ_ENTRY(NvmeChangedNs) entry;
+} NvmeChangedNs;
+
 typedef struct NvmeCtrl {
 PCIDeviceparent_obj;
 MemoryRegion bar0;
@@ -177,6 +182,8 @@ typedef struct NvmeCtrl {
 QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue;
 int aer_queued;
 
+QTAILQ_HEAD(, NvmeChangedNs) changed_ns_list;   /* Changed NS list log */
+
 NvmeSubsystem   *subsys;
 
 NvmeNamespace   namespace;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 339784d9c23a..eb0b31e949c2 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -760,6 +760,7 @@ typedef struct QEMU_PACKED NvmeCopySourceRange {
 enum NvmeAsyncEventRequest {

[PATCH V3 6/8] hw/block/nvme: support namespace attachment command

2021-02-28 Thread Minwoo Im
This patch supports Namespace Attachment command for the pre-defined
nvme-ns device nodes.  Of course, attach/detach namespace should only be
supported in case 'subsys' is given.  This is because if we detach a
namespace from a controller, somebody needs to manage the detached, but
allocated namespace in the NVMe subsystem.

As command effect for the namespace attachment command is registered,
the host will be notified that namespace inventory is changed so that
host will rescan the namespace inventory after this command.  For
example, kernel driver manages this command effect via passthru IOCTL.

Signed-off-by: Minwoo Im 
---
 hw/block/nvme-subsys.h | 10 +++
 hw/block/nvme.c| 61 +-
 hw/block/nvme.h|  5 
 hw/block/trace-events  |  2 ++
 include/block/nvme.h   |  6 +
 5 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index 14627f9ccb41..ef4bec928eae 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -30,6 +30,16 @@ typedef struct NvmeSubsystem {
 int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp);
 int nvme_subsys_register_ns(NvmeNamespace *ns, Error **errp);
 
+static inline NvmeCtrl *nvme_subsys_ctrl(NvmeSubsystem *subsys,
+uint32_t cntlid)
+{
+if (!subsys) {
+return NULL;
+}
+
+return subsys->ctrls[cntlid];
+}
+
 /*
  * Return allocated namespace of the specified nsid in the subsystem.
  */
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b18ab0ef810f..68c2e63d9412 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -187,6 +187,7 @@ static const uint32_t nvme_cse_acs[256] = {
 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_NS_ATTACHMENT]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
 };
 
 static const uint32_t nvme_cse_iocs_none[256];
@@ -3868,6 +3869,62 @@ static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
+static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns);
+static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeNamespace *ns;
+NvmeCtrl *ctrl;
+uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
+uint32_t nsid = le32_to_cpu(req->cmd.nsid);
+uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
+bool attach = !(dw10 & 0xf);
+uint16_t *nr_ids = [0];
+uint16_t *ids = [1];
+uint16_t ret;
+int i;
+
+trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
+
+ns = nvme_subsys_ns(n->subsys, nsid);
+if (!ns) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+ret = nvme_dma(n, (uint8_t *)list, 4096,
+   DMA_DIRECTION_TO_DEVICE, req);
+if (ret) {
+return ret;
+}
+
+if (!*nr_ids) {
+return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
+}
+
+for (i = 0; i < *nr_ids; i++) {
+ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
+if (!ctrl) {
+return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
+}
+
+if (attach) {
+if (nvme_ns_is_attached(ctrl, ns)) {
+return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
+}
+
+nvme_ns_attach(ctrl, ns);
+__nvme_select_ns_iocs(ctrl, ns);
+} else {
+if (!nvme_ns_is_attached(ctrl, ns)) {
+return NVME_NS_NOT_ATTACHED | NVME_DNR;
+}
+
+nvme_ns_detach(ctrl, ns);
+}
+}
+
+return NVME_SUCCESS;
+}
+
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
@@ -3899,6 +3956,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_get_feature(n, req);
 case NVME_ADM_CMD_ASYNC_EV_REQ:
 return nvme_aer(n, req);
+case NVME_ADM_CMD_NS_ATTACHMENT:
+return nvme_ns_attachment(n, req);
 default:
 assert(false);
 }
@@ -4865,7 +4924,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 
 id->mdts = n->params.mdts;
 id->ver = cpu_to_le32(NVME_SPEC_VER);
-id->oacs = cpu_to_le16(0);
+id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT);
 id->cntrltype = 0x1;
 
 /*
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 7599d6b1a41b..74a00ab21a55 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -220,6 +220,11 @@ static inline void nvme_ns_attach(NvmeCtrl *n, 
NvmeNamespace *ns)
 n->namespaces[nvme_nsid(ns) - 1] = ns;
 }
 
+static inline void nvme_ns_detach(NvmeCtrl *n, NvmeNamespace *ns)
+{
+n->namespaces[nvme_nsid(ns) - 1] = NULL;
+}
+
 static inline NvmeCQueue *nvme_cq(NvmeRequest *req)
 {
 NvmeSQueue *sq = req->sq;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 25ba51ea5405..98d542c999e2

[PATCH V3 2/8] hw/block/nvme: fix namespaces array to 1-based

2021-02-28 Thread Minwoo Im
subsys->namespaces array used to be sized to NVME_SUBSYS_MAX_NAMESPACES.
But subsys->namespaces are being accessed with 1-based namespace id
which means the very first array entry will always be empty(NULL).

Signed-off-by: Minwoo Im 
Tested-by: Klaus Jensen 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme-subsys.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index 890d118117dc..574774390c4c 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -24,7 +24,7 @@ typedef struct NvmeSubsystem {
 
 NvmeCtrl*ctrls[NVME_SUBSYS_MAX_CTRLS];
 /* Allocated namespaces for this subsystem */
-NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES];
+NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES + 1];
 } NvmeSubsystem;
 
 int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp);
-- 
2.25.1




[PATCH V3 1/8] hw/block/nvme: support namespace detach

2021-02-28 Thread Minwoo Im
Given that now we have nvme-subsys device supported, we can manage
namespace allocated, but not attached: detached.  This patch introduced
a parameter for nvme-ns device named 'detached'.  This parameter
indicates whether the given namespace device is detached from
an entire NVMe subsystem ('subsys' given case, shared namespace) or a
controller('bus' given case, private namespace).

- Allocated namespace

  1) Shared ns in the subsystem 'subsys0':

 -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,subsys=subsys0,detached=true

  2) Private ns for the controller 'nvme0' of the subsystem 'subsys0':

 -device nvme-subsys,id=subsys0
 -device nvme,serial=foo,id=nvme0,subsys=subsys0
 -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,bus=nvme0,detached=true

  3) (Invalid case) Controller 'nvme0' has no subsystem to manage ns:

 -device nvme,serial=foo,id=nvme0
 -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,bus=nvme0,detached=true

Signed-off-by: Minwoo Im 
Tested-by: Klaus Jensen 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme-ns.c |  1 +
 hw/block/nvme-ns.h |  1 +
 hw/block/nvme-subsys.h |  1 +
 hw/block/nvme.c| 41 +++--
 hw/block/nvme.h| 22 ++
 5 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 0e8760020483..eda6a0c003a4 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -399,6 +399,7 @@ static Property nvme_ns_props[] = {
 DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
 DEFINE_PROP_LINK("subsys", NvmeNamespace, subsys, TYPE_NVME_SUBSYS,
  NvmeSubsystem *),
+DEFINE_PROP_BOOL("detached", NvmeNamespace, params.detached, false),
 DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
 DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
 DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128),
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 7af6884862b5..b0c00e115d81 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -26,6 +26,7 @@ typedef struct NvmeZone {
 } NvmeZone;
 
 typedef struct NvmeNamespaceParams {
+bool detached;
 uint32_t nsid;
 QemuUUID uuid;
 
diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index ccf6a71398d3..890d118117dc 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -23,6 +23,7 @@ typedef struct NvmeSubsystem {
 uint8_t subnqn[256];
 
 NvmeCtrl*ctrls[NVME_SUBSYS_MAX_CTRLS];
+/* Allocated namespaces for this subsystem */
 NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES];
 } NvmeSubsystem;
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index edd0b85c10ce..f6aeae081840 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -23,7 +23,7 @@
  *  max_ioqpairs=, \
  *  aerl=, aer_max_queued=, \
  *  mdts=,zoned.append_size_limit=, \
- *  subsys= \
+ *  subsys=,detached=
  *  -device nvme-ns,drive=,bus=,nsid=,\
  *  zoned=, \
  *  subsys=
@@ -82,6 +82,13 @@
  *   controllers in the subsystem. Otherwise, `bus` must be given to attach
  *   this namespace to a specified single controller as a non-shared namespace.
  *
+ * - `detached`
+ *   Not to attach the namespace device to controllers in the NVMe subsystem
+ *   during boot-up. If not given, namespaces are all attached to all
+ *   controllers in the subsystem by default.
+ *   It's mutually exclusive with the 'bus' parameter. It's only valid in case
+ *   `subsys` is provided.
+ *
  * Setting `zoned` to true selects Zoned Command Set at the namespace.
  * In this case, the following namespace properties are available to configure
  * zoned operation:
@@ -4613,6 +4620,20 @@ static void nvme_init_state(NvmeCtrl *n)
 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
 }
 
+static int nvme_attach_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+{
+if (nvme_ns_is_attached(n, ns)) {
+error_setg(errp,
+   "namespace %d is already attached to controller %d",
+   nvme_nsid(ns), n->cntlid);
+return -1;
+}
+
+nvme_ns_attach(n, ns);
+
+return 0;
+}
+
 int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
 {
 uint32_t nsid = nvme_nsid(ns);
@@ -4644,7 +4665,23 @@ int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace 
*ns, Error **errp)
 
 trace_pci_nvme_register_namespace(nsid);
 
-n->namespaces[nsid - 1] = ns;
+/*
+ * If subsys is not given, namespace is always attached to the controller
+ * because there's no subsystem to manage namespace allocation.
+ */
+if (!n->subsys) {
+if (ns->params.detached) {
+error_setg(errp,
+   "detached needs nvme-subsys specified nvme or nvme-ns");
+return -1;
+  

[PATCH V3 8/8] hw/block/nvme: support Identify NS Attached Controller List

2021-02-28 Thread Minwoo Im
Support Identify command for Namespace attached controller list.  This
command handler will traverse the controller instances in the given
subsystem to figure out whether the specified nsid is attached to the
controllers or not.

The 4096bytes Identify data will return with the first entry (16bits)
indicating the number of the controller id entries.  So, the data can
hold up to 2047 entries for the controller ids.

Signed-off-by: Minwoo Im 
Tested-by: Klaus Jensen 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme.c   | 42 ++
 hw/block/trace-events |  1 +
 include/block/nvme.h  |  1 +
 3 files changed, 44 insertions(+)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index fc06f806e58e..202fc94d0bb2 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3286,6 +3286,46 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req, bool active)
 return NVME_INVALID_CMD_SET | NVME_DNR;
 }
 
+static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeIdentify *c = (NvmeIdentify *)>cmd;
+uint16_t min_id = le16_to_cpu(c->ctrlid);
+uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
+uint16_t *ids = [1];
+NvmeNamespace *ns;
+NvmeCtrl *ctrl;
+int cntlid, nr_ids = 0;
+
+trace_pci_nvme_identify_ns_attached_list(min_id);
+
+if (c->nsid == NVME_NSID_BROADCAST) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+ns = nvme_subsys_ns(n->subsys, c->nsid);
+if (!ns) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
+ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
+if (!ctrl) {
+continue;
+}
+
+if (!nvme_ns_is_attached(ctrl, ns)) {
+continue;
+}
+
+ids[nr_ids++] = cntlid;
+}
+
+list[0] = nr_ids;
+
+return nvme_dma(n, (uint8_t *)list, sizeof(list),
+DMA_DIRECTION_FROM_DEVICE, req);
+}
+
 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
 bool active)
 {
@@ -3485,6 +3525,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_identify_ns(n, req, true);
 case NVME_ID_CNS_NS_PRESENT:
 return nvme_identify_ns(n, req, false);
+case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
+return nvme_identify_ns_attached_list(n, req);
 case NVME_ID_CNS_CS_NS:
 return nvme_identify_ns_csi(n, req, true);
 case NVME_ID_CNS_CS_NS_PRESENT:
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 98d542c999e2..2628d69c7879 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -64,6 +64,7 @@ pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, 
cqid=%"PRIu16""
 pci_nvme_identify_ctrl(void) "identify controller"
 pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8""
 pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32""
+pci_nvme_identify_ns_attached_list(uint16_t cntid) "cntid=%"PRIu16""
 pci_nvme_identify_ns_csi(uint32_t ns, uint8_t csi) "nsid=%"PRIu32", 
csi=0x%"PRIx8""
 pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32""
 pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", 
csi=0x%"PRIx8""
diff --git a/include/block/nvme.h b/include/block/nvme.h
index eb0b31e949c2..b18945913927 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -971,6 +971,7 @@ enum NvmeIdCns {
 NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
 NVME_ID_CNS_NS_PRESENT_LIST   = 0x10,
 NVME_ID_CNS_NS_PRESENT= 0x11,
+NVME_ID_CNS_NS_ATTACHED_CTRL_LIST = 0x12,
 NVME_ID_CNS_CS_NS_PRESENT_LIST= 0x1a,
 NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
 NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
-- 
2.25.1




[PATCH V3 3/8] hw/block/nvme: fix allocated namespace list to 256

2021-02-28 Thread Minwoo Im
Expand allocated namespace list (subsys->namespaces) to have 256 entries
which is a value larger than NVME_MAX_NAMESPACES, which is for
attached namespace list in a controller.

The allocated namespace list should be at least as large as the attached
namespace list.

n->num_namespaces = NVME_MAX_NAMESPACES;

The above line will set the NN field by id->nn so that the subsystem
should also prepare at least this number of namespace list entries.

Signed-off-by: Minwoo Im 
Tested-by: Klaus Jensen 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme-subsys.h | 2 +-
 hw/block/nvme.h| 6 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index 574774390c4c..8a0732b22316 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -14,7 +14,7 @@
 OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
 
 #define NVME_SUBSYS_MAX_CTRLS   32
-#define NVME_SUBSYS_MAX_NAMESPACES  32
+#define NVME_SUBSYS_MAX_NAMESPACES  256
 
 typedef struct NvmeCtrl NvmeCtrl;
 typedef struct NvmeNamespace NvmeNamespace;
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 51b8739b4d1e..7599d6b1a41b 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -10,6 +10,12 @@
 #define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
 
+/*
+ * Subsystem namespace list for allocated namespaces should be larger than
+ * attached namespace list in a controller.
+ */
+QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_SUBSYS_MAX_NAMESPACES);
+
 typedef struct NvmeParams {
 char *serial;
 uint32_t num_queues; /* deprecated since 5.1 */
-- 
2.25.1




[PATCH V3 4/8] hw/block/nvme: support allocated namespace type

2021-02-28 Thread Minwoo Im
From NVMe spec 1.4b "6.1.5. NSID and Namespace Relationships" defines
valid namespace types:

- Unallocated: Not exists in the NVMe subsystem
- Allocated: Exists in the NVMe subsystem
- Inactive: Not attached to the controller
- Active: Attached to the controller

This patch added support for allocated, but not attached namespace type:

!nvme_ns(n, nsid) && nvme_subsys_ns(n->subsys, nsid)

nvme_ns() returns attached namespace instance of the given controller
and nvme_subsys_ns() returns allocated namespace instance in the
subsystem.

Signed-off-by: Minwoo Im 
Tested-by: Klaus Jensen 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme-subsys.h | 13 +
 hw/block/nvme.c| 63 +++---
 2 files changed, 60 insertions(+), 16 deletions(-)

diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index 8a0732b22316..14627f9ccb41 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -30,4 +30,17 @@ typedef struct NvmeSubsystem {
 int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp);
 int nvme_subsys_register_ns(NvmeNamespace *ns, Error **errp);
 
+/*
+ * Return allocated namespace of the specified nsid in the subsystem.
+ */
+static inline NvmeNamespace *nvme_subsys_ns(NvmeSubsystem *subsys,
+uint32_t nsid)
+{
+if (!subsys) {
+return NULL;
+}
+
+return subsys->namespaces[nsid];
+}
+
 #endif /* NVME_SUBSYS_H */
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f6aeae081840..53c4d59e09a7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3225,7 +3225,7 @@ static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
@@ -3239,7 +3239,14 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req)
 
 ns = nvme_ns(n, nsid);
 if (unlikely(!ns)) {
-return nvme_rpt_empty_id_struct(n, req);
+if (!active) {
+ns = nvme_subsys_ns(n->subsys, nsid);
+if (!ns) {
+return nvme_rpt_empty_id_struct(n, req);
+}
+} else {
+return nvme_rpt_empty_id_struct(n, req);
+}
 }
 
 if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
@@ -3250,7 +3257,8 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest 
*req)
 return NVME_INVALID_CMD_SET | NVME_DNR;
 }
 
-static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
+bool active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
@@ -3264,7 +3272,14 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, 
NvmeRequest *req)
 
 ns = nvme_ns(n, nsid);
 if (unlikely(!ns)) {
-return nvme_rpt_empty_id_struct(n, req);
+if (!active) {
+ns = nvme_subsys_ns(n->subsys, nsid);
+if (!ns) {
+return nvme_rpt_empty_id_struct(n, req);
+}
+} else {
+return nvme_rpt_empty_id_struct(n, req);
+}
 }
 
 if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
@@ -3277,7 +3292,8 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
+bool active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
@@ -3302,7 +3318,14 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 for (i = 1; i <= n->num_namespaces; i++) {
 ns = nvme_ns(n, i);
 if (!ns) {
-continue;
+if (!active) {
+ns = nvme_subsys_ns(n->subsys, i);
+if (!ns) {
+continue;
+}
+} else {
+continue;
+}
 }
 if (ns->params.nsid <= min_nsid) {
 continue;
@@ -3316,7 +3339,8 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
 }
 
-static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
+bool active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
@@ -3342,7 +3366,14 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, 
NvmeRequest *req)
 for (i = 1; i <= n->num_namespaces; i++) {
 ns = nvme_ns(n, i);
 if (!ns) {
-continue;
+if 

[PATCH V3 5/8] hw/block/nvme: refactor nvme_select_ns_iocs

2021-02-28 Thread Minwoo Im
This patch has no functional changes.  This patch just refactored
nvme_select_ns_iocs() to iterate the attached namespaces of the
controller and make it invoke __nvme_select_ns_iocs().

Signed-off-by: Minwoo Im 
Tested-by: Klaus Jensen 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme.c | 36 +---
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 53c4d59e09a7..b18ab0ef810f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4000,6 +4000,25 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n)
 }
 }
 
+static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns)
+{
+ns->iocs = nvme_cse_iocs_none;
+switch (ns->csi) {
+case NVME_CSI_NVM:
+if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
+ns->iocs = nvme_cse_iocs_nvm;
+}
+break;
+case NVME_CSI_ZONED:
+if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
+ns->iocs = nvme_cse_iocs_zoned;
+} else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
+ns->iocs = nvme_cse_iocs_nvm;
+}
+break;
+}
+}
+
 static void nvme_select_ns_iocs(NvmeCtrl *n)
 {
 NvmeNamespace *ns;
@@ -4010,21 +4029,8 @@ static void nvme_select_ns_iocs(NvmeCtrl *n)
 if (!ns) {
 continue;
 }
-ns->iocs = nvme_cse_iocs_none;
-switch (ns->csi) {
-case NVME_CSI_NVM:
-if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
-ns->iocs = nvme_cse_iocs_nvm;
-}
-break;
-case NVME_CSI_ZONED:
-if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
-ns->iocs = nvme_cse_iocs_zoned;
-} else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
-ns->iocs = nvme_cse_iocs_nvm;
-}
-break;
-}
+
+__nvme_select_ns_iocs(n, ns);
 }
 }
 
-- 
2.25.1




[PATCH V3 0/8] hw/block/nvme: support namespace attachment

2021-02-28 Thread Minwoo Im
Hello,

This series supports namespace attachment: attach and detach.  This is
the third version of the series, fixing command events and asynchronous
events based on Keith's review.

Since command effects for the namespace attachment command is added in
this version, we no longer need to rescan controller after namespace
attachment command.  Kernel will rescan the controller namespaces after
the command successfully done through passthru.

Please review.

Thanks,

Since V2:
  - Added command effects (namespace inventory changed) for namespace
attach command.  (Keith)
  - Added [7/8] patch to support asynchronus event when namespace
inventory is updated.  (Keith)
  - Added review and tested tag from Klaus to all the patches, but [6/8]
and [7/8].

Since V1:
  - Fix to take 'ctrl' which is given from the command rather than 'n'.
(Klaus)
  - Add a [7/7] patch to support CNS 12h Identify command (Namespace
Attached Controller list).

Minwoo Im (8):
  hw/block/nvme: support namespace detach
  hw/block/nvme: fix namespaces array to 1-based
  hw/block/nvme: fix allocated namespace list to 256
  hw/block/nvme: support allocated namespace type
  hw/block/nvme: refactor nvme_select_ns_iocs
  hw/block/nvme: support namespace attachment command
  hw/block/nvme: support changed namespace asynchronous event
  hw/block/nvme: support Identify NS Attached Controller List

 hw/block/nvme-ns.c |   1 +
 hw/block/nvme-ns.h |   1 +
 hw/block/nvme-subsys.h |  28 +++-
 hw/block/nvme.c| 287 -
 hw/block/nvme.h|  40 ++
 hw/block/trace-events  |   3 +
 include/block/nvme.h   |  14 ++
 7 files changed, 338 insertions(+), 36 deletions(-)

-- 
2.25.1




Re: [PATCH V2 6/7] hw/block/nvme: support namespace attachment command

2021-02-26 Thread Minwoo Im
On 21-02-27 02:59:35, Keith Busch wrote:
> On Thu, Feb 11, 2021 at 01:09:36AM +0900, Minwoo Im wrote:
> > @@ -183,6 +183,7 @@ static const uint32_t nvme_cse_acs[256] = {
> >  [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
> >  [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
> >  [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
> > +[NVME_ADM_CMD_NS_ATTACHMENT]= NVME_CMD_EFF_CSUPP,
> 
> Missing NVME_CMD_EFF_NIC for the attachment command.

Will do that!

> >  };
> >  
> >  static const uint32_t nvme_cse_iocs_none[256];
> > @@ -3766,6 +3767,62 @@ static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest 
> > *req)
> >  return NVME_NO_COMPLETE;
> >  }
> >  
> > +static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns);
> > +static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
> > +{
> > +NvmeNamespace *ns;
> > +NvmeCtrl *ctrl;
> > +uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
> > +uint32_t nsid = le32_to_cpu(req->cmd.nsid);
> > +uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
> > +bool attach = !(dw10 & 0xf);
> > +uint16_t *nr_ids = [0];
> > +uint16_t *ids = [1];
> > +uint16_t ret;
> > +int i;
> > +
> > +trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
> > +
> > +ns = nvme_subsys_ns(n->subsys, nsid);
> > +if (!ns) {
> > +return NVME_INVALID_FIELD | NVME_DNR;
> > +}
> > +
> > +ret = nvme_dma(n, (uint8_t *)list, 4096,
> > +   DMA_DIRECTION_TO_DEVICE, req);
> > +if (ret) {
> > +return ret;
> > +}
> > +
> > +if (!*nr_ids) {
> > +return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
> > +}
> > +
> > +for (i = 0; i < *nr_ids; i++) {
> > +ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
> > +if (!ctrl) {
> > +return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
> > +}
> > +
> > +if (attach) {
> > +if (nvme_ns_is_attached(ctrl, ns)) {
> > +return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
> > +}
> > +
> > +nvme_ns_attach(ctrl, ns);
> > +__nvme_select_ns_iocs(ctrl, ns);
> > +} else {
> > +if (!nvme_ns_is_attached(ctrl, ns)) {
> > +return NVME_NS_NOT_ATTACHED | NVME_DNR;
> > +}
> > +
> > +nvme_ns_detach(ctrl, ns);
> > +}
> > +}
> > +
> > +return NVME_SUCCESS;
> > +}
> 
> Every controller that has newly attached the namespace needs to emit the
> Namespace Notify AER in order for the host to react correctly to the
> command.

Okay. will prepare next series.

Thanks!



Re: [PATCH 2/2] hw/block/nvme: add 'nvme_ana_inject_state' HMP command

2021-02-26 Thread Minwoo Im
On 21-02-14 20:24:00, Minwoo Im wrote:
> Human Monitor Interface(HMP) is there for easy human debugging.  This
> patch added a HMP command 'nvme_ana_inject_state'.  This can be executed
> from the QEMU monitor.  This command will have the following syntax:
> 
>   # nvme_ana_inject_state   
>   (qemu) nvme_ana_inject_state nvme0 1 inaccessible
> 
> The example above will make ANA group #1 transitioned to
> ANA_INACCESSIBLE state for `nvme0` controller device.  Additionally,
> device will notify to the host that ANA has been changed via
> Asynchronous Event Notifier(AEN).  Then the host will figure out another
> path to I/O for the namespace by reading the log page for ANA
> information again, and this is what we call the multipath I/O.
> 
> This feature is good to debug the host multipath I/O by controlling the
> device ANA group state transition.  The path-related errors can be
> tested and debugged by this feature.  Also, the HMP command interface
> will make us not to build QEMU itself again to control things in device.
> 
> This interface supports Persistent Loss state transition, but it's not
> going to be persistent: volatile of qemu perspective.
> 
> Cc: Dr . David Alan Gilbert 
> Signed-off-by: Minwoo Im 

Hello Keith,

Do you have any comments about this injection method?  As discussed
earlier, I've tried to introduce a natural way to control some of the
device status, like the ANA state, so that the device can behave properly.

It would be great if I can have your feedback on this :)

Thanks!



Re: [PATCH V2 0/6] hw/block/nvme: support namespace attachment

2021-02-26 Thread Minwoo Im
On 21-02-11 01:09:30, Minwoo Im wrote:
> Hello,
> 
> This series supports namespace attachment: attach and detach.  This is
> the second version of the series with a fix for a bug in choosing a controller to
> attach for a namespace in the attach command handler.

Keith,

Could you please have a look at this series and give some comments :) ?



Re: [PATCH 2/2] hw/nvme: move device-scoped functions

2021-02-23 Thread Minwoo Im
On 21-02-09 12:08:26, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Move a bunch of functions that are internal to a device out of the
> shared header.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/nvme/nvme.h | 110 +
>  hw/nvme/ctrl.c |  90 +++-
>  hw/nvme/ns.c   |   7 +++-
>  3 files changed, 97 insertions(+), 110 deletions(-)
> 
> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 452a64499b1b..929c6c553ca2 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -96,36 +96,13 @@ static inline uint32_t nvme_nsid(NvmeNamespace *ns)
>  return -1;
>  }
>  
> -static inline bool nvme_ns_shared(NvmeNamespace *ns)
> -{
> -return !!ns->subsys;
> -}

Re-raising this up.

Something like this helper is related to the data structure usages
itself like, if ns->subsys is not NULL, it "can" mean that this
namespace is shared among controllers.  This helper represents that the
'subsys' member itself is indicating some meaning, not only just for the
subsystem itself.

That's why I've been hesitating to simply ack to this patch ;)

But, I am not strongly against to this so please make a decision with
Keith and go ahead!

Thanks!

>  static inline uint8_t nvme_ns_lbads(NvmeNamespace *ns)
>  {
> -return nvme_ns_lbaf(ns)->ds;
> -}
> +NvmeLBAF lbaf = ns->id_ns.lbaf[NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas)];
>  
> -/* calculate the number of LBAs that the namespace can accomodate */
> -static inline uint64_t nvme_ns_nlbas(NvmeNamespace *ns)
> -{
> -return ns->size >> nvme_ns_lbads(ns);
> +return lbaf.ds;
>  }
>  
> -/* convert an LBA to the equivalent in bytes */
> -static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba)
> -{
> -return lba << nvme_ns_lbads(ns);
> -}
> -
> -typedef struct NvmeCtrl NvmeCtrl;
> -
>  static inline NvmeZoneState nvme_get_zone_state(NvmeZone *zone)
>  {
>  return zone->d.zs >> 4;
> @@ -136,31 +113,6 @@ static inline void nvme_set_zone_state(NvmeZone *zone, 
> NvmeZoneState state)
>  zone->d.zs = state << 4;
>  }
>  
> -static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone 
> *zone)
> -{
> -return zone->d.zslba + ns->zone_size;
> -}
> -
> -static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
> -{
> -return zone->d.zslba + zone->d.zcap;
> -}
> -
> -static inline bool nvme_wp_is_valid(NvmeZone *zone)
> -{
> -uint8_t st = nvme_get_zone_state(zone);
> -
> -return st != NVME_ZONE_STATE_FULL &&
> -   st != NVME_ZONE_STATE_READ_ONLY &&
> -   st != NVME_ZONE_STATE_OFFLINE;
> -}
> -
> -static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
> - uint32_t zone_idx)
> -{
> -return >zd_extensions[zone_idx * ns->params.zd_extension_size];
> -}
> -
>  static inline void nvme_aor_inc_open(NvmeNamespace *ns)
>  {
>  assert(ns->nr_open_zones >= 0);
> @@ -203,7 +155,6 @@ void nvme_ns_drain(NvmeNamespace *ns);
>  void nvme_ns_shutdown(NvmeNamespace *ns);
>  void nvme_ns_cleanup(NvmeNamespace *ns);
>  
> -
>  typedef struct NvmeParams {
>  char *serial;
>  uint32_t num_queues; /* deprecated since 5.1 */
> @@ -237,40 +188,6 @@ typedef struct NvmeRequest {
>  QTAILQ_ENTRY(NvmeRequest)entry;
>  } NvmeRequest;
>  
> -static inline const char *nvme_adm_opc_str(uint8_t opc)
> -{
> -switch (opc) {
> -case NVME_ADM_CMD_DELETE_SQ:return "NVME_ADM_CMD_DELETE_SQ";
> -case NVME_ADM_CMD_CREATE_SQ:return "NVME_ADM_CMD_CREATE_SQ";
> -case NVME_ADM_CMD_GET_LOG_PAGE: return "NVME_ADM_CMD_GET_LOG_PAGE";
> -case NVME_ADM_CMD_DELETE_CQ:return "NVME_ADM_CMD_DELETE_CQ";
> -case NVME_ADM_CMD_CREATE_CQ:return "NVME_ADM_CMD_CREATE_CQ";
> -case NVME_ADM_CMD_IDENTIFY: return "NVME_ADM_CMD_IDENTIFY";
> -case NVME_ADM_CMD_ABORT:return "NVME_ADM_CMD_ABORT";
> -case NVME_ADM_CMD_SET_FEATURES: return "NVME_ADM_CMD_SET_FEATURES";
> -case NVME_ADM_CMD_GET_FEATURES: return "NVME_ADM_CMD_GET_FEATURES";
> -case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ";
> -default:return "NVME_ADM_CMD_UNKNOWN";
> -}
> -}
> -
> -static inline const char *nvme_io_opc_str(uint8_t opc)
> -{
> -switch (opc) {
> -case NVME_CMD_FLUSH:return "NVME_NVM_CMD_FLUSH";
> -case NVME_CMD_WRITE:return "NVME_NVM_CMD_WRITE";
> -case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
> -case NVME_CMD_COMPARE:  return "NVME_NVM_CMD_COMPARE";
> -case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
> -case NVME_CMD_DSM:  return "NVME_NVM_CMD_DSM";
> -case NVME_CMD_COPY: return "NVME_NVM_CMD_COPY";
> -case NVME_CMD_ZONE_MGMT_SEND:   return "NVME_ZONED_CMD_MGMT_SEND";
> -case NVME_CMD_ZONE_MGMT_RECV:   return "NVME_ZONED_CMD_MGMT_RECV";
> -case 

Re: [PATCH 3/3] hw/block/nvme: report non-mdts command size limit for dsm

2021-02-22 Thread Minwoo Im
On 21-02-22 08:06:15, Klaus Jensen wrote:
> From: Gollu Appalanaidu 
> 
> Dataset Management is not subject to MDTS, but exceeded a certain size
> per range causes internal looping. Report this limit (DMRSL) in the NVM
> command set specific identify controller data structure.
> 
> Signed-off-by: Gollu Appalanaidu 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.h   |  1 +
>  include/block/nvme.h  | 11 +++
>  hw/block/nvme.c   | 30 --
>  hw/block/trace-events |  1 +
>  4 files changed, 33 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index cb2b5175f1a1..3046b82b3da1 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -172,6 +172,7 @@ typedef struct NvmeCtrl {
>  int aer_queued;
>  
>  uint8_t zasl;
> +uint32_tdmrsl;
>  
>  NvmeSubsystem   *subsys;
>  
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index b23f3ae2279f..16d8c4c90f7e 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -1041,6 +1041,16 @@ typedef struct NvmeIdCtrlZoned {
>  uint8_t rsvd1[4095];
>  } NvmeIdCtrlZoned;
>  
> +typedef struct NvmeIdCtrlNvm {
> +uint8_t vsl;
> +uint8_t wzsl;
> +uint8_t wusl;
> +uint8_t dmrl;
> +uint32_tdmrsl;
> +uint64_tdmsl;
> +uint8_t rsvd16[4080];
> +} NvmeIdCtrlNvm;
> +
>  enum NvmeIdCtrlOacs {
>  NVME_OACS_SECURITY  = 1 << 0,
>  NVME_OACS_FORMAT= 1 << 1,
> @@ -1396,6 +1406,7 @@ static inline void _nvme_check_size(void)
>  QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096);
>  QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
>  QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlZoned) != 4096);
> +QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlNvm) != 4096);
>  QEMU_BUILD_BUG_ON(sizeof(NvmeLBAF) != 4);
>  QEMU_BUILD_BUG_ON(sizeof(NvmeLBAFE) != 16);
>  QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 897b9ff0db91..5d6bba5fcb0d 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1777,6 +1777,10 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
>  trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
>nlb);
>  
> +if (nlb > n->dmrsl) {
> +trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, 
> n->dmrsl);
> +}
> +
>  offset = nvme_l2b(ns, slba);
>  len = nvme_l2b(ns, nlb);
>  
> @@ -3202,21 +3206,24 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, 
> NvmeRequest *req)
>  static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
>  {
>  NvmeIdentify *c = (NvmeIdentify *)>cmd;
> -NvmeIdCtrlZoned id = {};
> +uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
>  
>  trace_pci_nvme_identify_ctrl_csi(c->csi);
>  
> -if (c->csi == NVME_CSI_NVM) {
> -return nvme_rpt_empty_id_struct(n, req);
> -} else if (c->csi == NVME_CSI_ZONED) {
> -if (n->params.zasl_bs) {
> -id.zasl = n->zasl;
> -}
> -return nvme_dma(n, (uint8_t *), sizeof(id),
> -DMA_DIRECTION_FROM_DEVICE, req);
> +switch (c->csi) {
> +case NVME_CSI_NVM:
> +((NvmeIdCtrlNvm *))->dmrsl = cpu_to_le32(n->dmrsl);
> +break;
> +
> +case NVME_CSI_ZONED:
> +((NvmeIdCtrlZoned *))->zasl = n->zasl;

Question.  Are we okay without checking this like above ? :)

if (n->params.zasl_bs) {
((NvmeIdCtrlZoned *))->zasl = n->zasl;
}

> +break;
> +
> +default:
> +return NVME_INVALID_FIELD | NVME_DNR;
>  }
>  
> -return NVME_INVALID_FIELD | NVME_DNR;
> +return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
>  static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
> @@ -4670,6 +4677,9 @@ int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace 
> *ns, Error **errp)
>  
>  n->namespaces[nsid - 1] = ns;
>  
> +n->dmrsl = MIN_NON_ZERO(n->dmrsl,
> +BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
> +
>  return 0;
>  }
>  
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index 1f958d09d2a9..27940fe2e98a 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -51,6 +51,7 @@ pci_nvme_copy_cb(uint16_t cid) "cid %"PRIu16""
>  pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, 
> bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed 
> %d"
>  pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid 
> %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
>  pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t 
> nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
> +pci_nvme_dsm_single_range_limit_exceeded(uint32_t nlb, uint32_t dmrsl) "nlb 
> %"PRIu32" dmrsl %"PRIu32""
>  

Re: [PATCH 2/3] hw/block/nvme: fix potential compilation error

2021-02-22 Thread Minwoo Im
On 21-02-22 08:06:14, Klaus Jensen wrote:
> From: Gollu Appalanaidu 
> 
> assert may be compiled to a noop and we could end up returning an
> uninitialized status.
> 
> Fix this by always returning Internal Device Error as a fallback.
> 
> Signed-off-by: Gollu Appalanaidu 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 11 +--
>  1 file changed, 5 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index ddc83f7f7a19..897b9ff0db91 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1232,7 +1232,7 @@ static uint16_t nvme_check_zone_write(NvmeNamespace 
> *ns, NvmeZone *zone,
>  
>  static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
>  {
> -uint16_t status;
> +uint64_t zslba = zone->d.zslba;
>  
>  switch (nvme_get_zone_state(zone)) {
>  case NVME_ZONE_STATE_EMPTY:
> @@ -1241,16 +1241,15 @@ static uint16_t 
> nvme_check_zone_state_for_read(NvmeZone *zone)
>  case NVME_ZONE_STATE_FULL:
>  case NVME_ZONE_STATE_CLOSED:
>  case NVME_ZONE_STATE_READ_ONLY:
> -status = NVME_SUCCESS;
> -break;
> +return NVME_SUCCESS;
>  case NVME_ZONE_STATE_OFFLINE:
> -status = NVME_ZONE_OFFLINE;
> -break;
> +trace_pci_nvme_err_zone_is_offline(zslba);

This also is a tiny addition to the potential error fix.  Anyway, it can
be shorten to: (if zslba is used in a place only)

trace_pci_nvme_err_zone_is_offline(zone->d.zslba);

> +return NVME_ZONE_OFFLINE;
>  default:
>  assert(false);
>  }
>  
> -return status;
> +return NVME_INTERNAL_DEV_ERROR;
>  }
>  
>  static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
> -- 
> 2.30.1
> 
> 



Re: [PATCH 1/3] hw/block/nvme: nvme_identify fixes

2021-02-22 Thread Minwoo Im
On 21-02-22 08:06:13, Klaus Jensen wrote:
> From: Gollu Appalanaidu 
> 
> Remove an unnecessary le_to_cpu conversion and add trace event for
> Identify.
> 
> Signed-off-by: Gollu Appalanaidu 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 5 -
>  hw/block/trace-events | 1 +
>  2 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 1cd82fa3c9fe..ddc83f7f7a19 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -3415,7 +3415,10 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
> *req)
>  {
>  NvmeIdentify *c = (NvmeIdentify *)>cmd;
>  
> -switch (le32_to_cpu(c->cns)) {
> +trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
> +c->csi);

I think it would be great if it can be separated into two.
Anyway, changes look good to me.

Reviewed-by: Minwoo Im 

> +
> +switch (c->cns) {
>  case NVME_ID_CNS_NS:
>   /* fall through */
>  case NVME_ID_CNS_NS_PRESENT:
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index b04f7a3e1890..1f958d09d2a9 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -61,6 +61,7 @@ pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t 
> cqid, uint16_t qsize,
>  pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t 
> size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", 
> cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
>  pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
>  pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16""
> +pci_nvme_identify(uint16_t cid, uint8_t cns, uint16_t ctrlid, uint8_t csi) 
> "cid %"PRIu16" cns 0x%"PRIx8" ctrlid %"PRIu16" csi 0x%"PRIx8""
>  pci_nvme_identify_ctrl(void) "identify controller"
>  pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8""
>  pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32""
> -- 
> 2.30.1
> 
> 



[PATCH V2 1/1] hw/block/nvme: support command retry delay

2021-02-14 Thread Minwoo Im
Set CRDT1(Command Retry Delay Time 1) in the Identify controller data
structure to milliseconds units of 100ms by the given value of
'cmd-retry-delay' parameter which is newly added.  If
cmd-retry-delay=1000, it will set CRDT1 to 10.  This patch only
considers the CRDT1 without CRDT2 and 3 for the simplicity.

This patch also introduced set/get feature command handler for Host
Behavior feature (16h).  In this feature, ACRE(Advanced Command Retry
Enable) will be set by the host based on the Identify controller data
structure, especially by CRDTs.

Signed-off-by: Minwoo Im 
---
 hw/block/nvme.c  | 68 +++-
 hw/block/nvme.h  |  2 ++
 include/block/nvme.h | 13 -
 3 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1cd82fa3c9fe..5aedb26cea9e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -23,7 +23,7 @@
  *  max_ioqpairs=, \
  *  aerl=, aer_max_queued=, \
  *  mdts=,zoned.append_size_limit=, \
- *  subsys= \
+ *  subsys=,cmd-retry-delay= \
  *  -device nvme-ns,drive=,bus=,nsid=,\
  *  zoned=, \
  *  subsys=
@@ -71,6 +71,14 @@
  *   data size being in effect. By setting this property to 0, users can make
  *   ZASL to be equal to MDTS. This property only affects zoned namespaces.
  *
+ * - `cmd-retry-delay`
+ *   Command Retry Delay value in unit of millisecond.  This value will be
+ *   reported to the CRDT1(Command Retry Delay Time 1) in Identify Controller
+ *   data structure in 100 milliseconds unit.  If this is not given, DNR(Do Not
+ *   Retry) bit field in the Status field of CQ entry.  If it's given to 0,
+ *   CRD(Command Retry Delay) will be set to 0 which is for retry without
+ *   delay.  Otherwise, it will set to 1 to delay for CRDT1 value.
+ *
  * nvme namespace device parameters
  * 
  * - `subsys`
@@ -154,6 +162,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
 [NVME_WRITE_ATOMICITY]  = true,
 [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
 [NVME_TIMESTAMP]= true,
+[NVME_HOST_BEHAVIOR_SUPPORT]= true,
 };
 
 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
@@ -163,6 +172,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
 [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
+[NVME_HOST_BEHAVIOR_SUPPORT]= NVME_FEAT_CAP_CHANGE,
 };
 
 static const uint32_t nvme_cse_acs[256] = {
@@ -942,9 +952,30 @@ static void nvme_post_cqes(void *opaque)
 }
 }
 
+static void nvme_setup_crdt(NvmeCtrl *n, NvmeRequest *req)
+{
+if (!n->features.acre) {
+return;
+}
+
+/*
+ * We just support CRDT1 for now without considering CRDT2 and CRDT3.
+ * Also, regardless to the status code, if there's no NVME_DNR, then we
+ * set up the command retry delay.
+ */
+if (req->status && !(req->status & NVME_DNR)) {
+if (n->params.cmd_retry_delay) {
+req->status |= NVME_CRD_CRDT1;
+}
+}
+}
+
 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
 {
 assert(cq->cqid == req->sq->cqid);
+
+nvme_setup_crdt(cq->ctrl, req);
+
 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
   req->status);
 
@@ -3501,6 +3532,16 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, 
NvmeRequest *req)
 DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static uint16_t nvme_get_feature_host_behavior(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeFeatureHostBehavior data = {};
+
+data.acre = n->features.acre;
+
+return nvme_dma(n, (uint8_t *), sizeof(data),
+DMA_DIRECTION_FROM_DEVICE, req);
+}
+
 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeCmd *cmd = >cmd;
@@ -3607,6 +3648,8 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest 
*req)
 goto out;
 case NVME_TIMESTAMP:
 return nvme_get_feature_timestamp(n, req);
+case NVME_HOST_BEHAVIOR_SUPPORT:
+return nvme_get_feature_host_behavior(n, req);
 default:
 break;
 }
@@ -3670,6 +3713,22 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_SUCCESS;
 }
 
+static uint16_t nvme_set_feature_host_behavior(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeFeatureHostBehavior data;
+int ret;
+
+ret = nvme_dma(n, (uint8_t *)&data, sizeof(data),
+   DMA_DIRECTION_TO_DEVICE, req);
+if (ret) {
+return ret;
+}
+
+n->features.acre = data.acre;
+
+return NVME_SUCCESS;
+}
+
 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeNamespace *ns = NULL;
@@ -3801,6 +

[PATCH V2 0/1] hw/block/nvme: support command retry

2021-02-14 Thread Minwoo Im
Hello,

This series has been discussed and reviewed in [1].  This is the second
series to support Advanced Command Retry Enable(ACRE).

At the first shot, it was designed to provide HMP commands to inject
artificial state into the NVMe device.  But, as discussed, rather than
making a device with an artificial state, a more natural way is needed:
Some states triggered by lower layer of the command processing like
AIO errors.  Therefore, rather than providing HMP command, This version
just checks the NVME_DNR bit from the status, then it will enable the
retry delay field(CRDT).

Since RFC V1:
  [Klaus, I didn't put your review tag due to the following changes ;)]
  - Remove [1/3] patch because there are already !NVME_DNR error cases
in nvme_aio_err(). (Klaus)
  - Remove [3/3] patch to trigger retry status situation more naturally
not injecting the intended state to the NVMe device. (Keith)
  - Remove nvme_should_retry() by not considering the status code,
especially NVME_COMMAND_INTERRUPTED.
  - Change `cmd-retry-delay` param type to uint32_t because we don't
need to check if it's given or not.  So, zero can be started with.

[1] https://lists.nongnu.org/archive/html/qemu-block/2021-02/msg00843.html

Minwoo Im (1):
  hw/block/nvme: support command retry delay

 hw/block/nvme.c  | 68 +++-
 hw/block/nvme.h  |  2 ++
 include/block/nvme.h | 13 -
 3 files changed, 81 insertions(+), 2 deletions(-)

-- 
2.17.1




[PATCH 2/2] hw/block/nvme: add 'nvme_ana_inject_state' HMP command

2021-02-14 Thread Minwoo Im
Human Monitor Interface(HMP) is there for easy human debugging.  This
patch added a HMP command 'nvme_ana_inject_state'.  This can be executed
from the QEMU monitor.  This command will have the following syntax:

# nvme_ana_inject_state   
(qemu) nvme_ana_inject_state nvme0 1 inaccessible

The example above will make ANA group #1 transitioned to
ANA_INACCESSIBLE state for `nvme0` controller device.  Additionally,
the device will notify the host that ANA has been changed via
Asynchronous Event Notifier(AEN).  Then the host will figure out another
path to I/O for the namespace by reading the log page for ANA
information again, and this is what we call the multipath I/O.

This feature is good to debug the host multipath I/O by controlling the
device ANA group state transition.  The path-related errors can be
tested and debugged by this feature.  Also, the HMP command interface
saves us from rebuilding QEMU itself just to control things in the device.

This interface supports the Persistent Loss state transition, but it's not
going to be persistent: it is volatile from QEMU's perspective.

Cc: Dr . David Alan Gilbert 
Signed-off-by: Minwoo Im 
---
 hmp-commands.hx   | 13 ++
 hw/block/nvme.c   | 93 +++
 hw/block/nvme.h   |  1 +
 include/block/nvme.h  |  1 +
 include/monitor/hmp.h |  1 +
 5 files changed, 109 insertions(+)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index d4001f9c5dc6..5a099191429d 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1307,6 +1307,19 @@ SRST
   Inject PCIe AER error
 ERST
 
+{
+.name   = "nvme_ana_inject_state",
+.args_type  = "id:s,grpid:i,state:s",
+.params = "id grpid [optimized|non-optimized|inaccessible|change]",
+.help   = "inject ANA state",
+.cmd= hmp_nvme_ana_inject_state,
+},
+
+SRST
+``nvme_ana_inject_state``
+  Inject ANA state to NVMe subsystem
+ERST
+
 {
 .name   = "netdev_add",
 .args_type  = "netdev:O",
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 8f5c2c1ab7f7..b40fd3230b8d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -131,6 +131,8 @@
 #include "qemu/log.h"
 #include "qemu/module.h"
 #include "qemu/cutils.h"
+#include "qapi/qmp/qdict.h"
+#include "monitor/monitor.h"
 #include "trace.h"
 #include "nvme.h"
 #include "nvme-ns.h"
@@ -229,6 +231,46 @@ static uint16_t nvme_sqid(NvmeRequest *req)
 return le16_to_cpu(req->sq->sqid);
 }
 
+static void nvme_notice_event(NvmeCtrl *n, uint8_t event_info);
+static bool nvme_ana_state_change(NvmeCtrl *n, uint32_t grpid, uint8_t state)
+{
+uint8_t old;
+
+old = n->ana[grpid].state;
+
+if (state == old) {
+return false;
+}
+
+n->ana[grpid].state = state;
+n->ana_change_count++;
+nvme_notice_event(n, NVME_AER_INFO_ANA_CHANGE);
+
+return true;
+}
+
+static const char *nvme_ana_states[] = {
+[NVME_ANA_STATE_OPTIMIZED]  = "optimized",
+[NVME_ANA_STATE_NON_OPTIMIZED]  = "non-optimized",
+[NVME_ANA_STATE_INACCESSIBLE]   = "inaccessible",
+[NVME_ANA_STATE_PERSISTENT_LOSS]= "persistent-loss",
+[NVME_ANA_STATE_CHANGE] = "change",
+};
+
+static inline bool nvme_ana_state_valid(uint8_t state)
+{
+switch (state) {
+case NVME_ANA_STATE_OPTIMIZED:
+case NVME_ANA_STATE_NON_OPTIMIZED:
+case NVME_ANA_STATE_INACCESSIBLE:
+case NVME_ANA_STATE_PERSISTENT_LOSS:
+case NVME_ANA_STATE_CHANGE:
+return true;
+default:
+return false;
+}
+}
+
 static inline uint16_t nvme_ana_check_state(uint8_t state)
 {
 switch (state) {
@@ -243,6 +285,42 @@ static inline uint16_t nvme_ana_check_state(uint8_t state)
 }
 }
 
+void hmp_nvme_ana_inject_state(Monitor *mon, const QDict *qdict)
+{
+const char *id = qdict_get_str(qdict, "id");
+const uint32_t grpid = qdict_get_int(qdict, "grpid");
+const char *state = qdict_get_str(qdict, "state");
+NvmeCtrl *n;
+DeviceState *dev;
+int i;
+
+dev = qdev_find_recursive(sysbus_get_default(), id);
+if (!dev) {
+monitor_printf(mon, "%s: invalid device id\n", id);
+return;
+}
+
+if (!grpid) {
+monitor_printf(mon, "%s: grpid should not be 0\n", id);
+return;
+}
+
+n = NVME(dev);
+
+for (i = 0; i < ARRAY_SIZE(nvme_ana_states); i++) {
+if (nvme_ana_state_valid(i) &&
+!strcmp(nvme_ana_states[i], state)) {
+if (nvme_ana_state_change(n, grpid, i)) {
+monitor_printf(mon, "%s: ANA state %s(%d) injected\n",
+   id, state, i);
+}
+return;
+}
+}
+
+

[PATCH 1/2] hw/block/nvme: support Asymmetric Namespace Access(ANA)

2021-02-14 Thread Minwoo Im
Recently we have been able to introduce multipath I/O with namespace
sharing with a single host.  One of the major feature for the multipath
I/O is Asymmetric Namespace Access: so-called ANA.

Also, the multipath I/O is one of the most major path for the host
system where this feature addition might be helpful to debug for.

This patch introduced ANA scheme to the nvme-subsys, nvme and nvme-ns
device.  The `ana` device parameter should be set to true if you want to
support ANA feature for all controllers in the subsystem:

  -device nvme-subsys,id=subsys0,ana=true

This patch also introduced ANA Group which is defined in 8.20.2 ANA
Groups in NVMe 1.4b spec.  ANA Group can be specified by the user with
`ana.grpid` parameter (non-zero value) to nvme-ns device:

  -device nvme-ns,id=ns1,drv=drv1,subsys=subsys0,ana.grpid=1

These relationships are communicated with the host by Get Log Page
command for the ANA information.  The ANA log information is retrieved
based on the relationship between controller and namespace with ANA
group.

This patch does not contain the ANA state transition by the device
itself.  The following patch will support the ANA state transition from
the device side.

Signed-off-by: Minwoo Im 
---
 hw/block/nvme-ns.c |  26 
 hw/block/nvme-ns.h |   3 +
 hw/block/nvme-subsys.c |   6 ++
 hw/block/nvme-subsys.h |   6 ++
 hw/block/nvme.c| 147 -
 hw/block/nvme.h|  25 +++
 include/block/nvme.h   |  50 +-
 7 files changed, 260 insertions(+), 3 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 0e8760020483..1bfc50eb1035 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -75,6 +75,31 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 return 0;
 }
 
+int nvme_ns_post_init(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+{
+NvmeIdNs *id_ns = &ns->id_ns;
+
+if (n->subsys && !n->subsys->params.ana && ns->params.anagrpid) {
+error_setg(errp, "anagrpid needs 'ana=true' in nvme subsystem");
+return -1;
+}
+
+if (ns->params.anagrpid && !nvme_ns_shared(ns)) {
+error_setg(errp, "anagrpid needs nvme-ns device shared");
+return -1;
+}
+
+if (ns->params.anagrpid > n->id_ctrl.anagrpmax) {
+error_setg(errp,
+   "anagrpid should be less than %u", n->id_ctrl.anagrpmax);
+return -1;
+}
+
+id_ns->anagrpid = ns->params.anagrpid;
+
+return 0;
+}
+
 static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
 {
 bool read_only;
@@ -417,6 +442,7 @@ static Property nvme_ns_props[] = {
params.max_open_zones, 0),
 DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
params.zd_extension_size, 0),
+DEFINE_PROP_UINT32("ana.grpid", NvmeNamespace, params.anagrpid, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 7af6884862b5..b3ca6176f4ce 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -40,8 +40,10 @@ typedef struct NvmeNamespaceParams {
 uint32_t max_active_zones;
 uint32_t max_open_zones;
 uint32_t zd_extension_size;
+uint32_t anagrpid;
 } NvmeNamespaceParams;
 
+typedef struct NvmeAna NvmeAna;
 typedef struct NvmeNamespace {
 DeviceState  parent_obj;
 BlockConfblkconf;
@@ -185,6 +187,7 @@ static inline void nvme_aor_dec_active(NvmeNamespace *ns)
 assert(ns->nr_active_zones >= 0);
 }
 
+int nvme_ns_post_init(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
diff --git a/hw/block/nvme-subsys.c b/hw/block/nvme-subsys.c
index 641de33e99fc..72fcf4cc6966 100644
--- a/hw/block/nvme-subsys.c
+++ b/hw/block/nvme-subsys.c
@@ -81,6 +81,11 @@ static void nvme_subsys_realize(DeviceState *dev, Error 
**errp)
 nvme_subsys_setup(subsys);
 }
 
+static Property nvme_subsys_props[] = {
+DEFINE_PROP_BOOL("ana", NvmeSubsystem, params.ana, false),
+DEFINE_PROP_END_OF_LIST(),
+};
+
 static void nvme_subsys_class_init(ObjectClass *oc, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(oc);
@@ -89,6 +94,7 @@ static void nvme_subsys_class_init(ObjectClass *oc, void 
*data)
 
 dc->realize = nvme_subsys_realize;
 dc->desc = "Virtual NVMe subsystem";
+device_class_set_props(dc, nvme_subsys_props);
 }
 
 static const TypeInfo nvme_subsys_info = {
diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index ccf6a71398d3..7d177312e554 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -16,6 +16,10 @@
 #define NVME_SUBSYS_MAX_CTRLS   32
 #define NVME_SUBSYS_MAX_NAMESPACES  32
 
+typedef struct NvmeSubsystemParams {
+bool ana;
+} NvmeSubsystemParams;
+
 t

[PATCH 0/2] hw/block/nvme: support ANA

2021-02-14 Thread Minwoo Im
Hello,

This series is to support Asymmetric Namespace Access(ANA).

`ana` parameter to the `nvme-subsys` device will turn all the
controllers in the subsystem to support multipath I/O with ANA.  Once
ana is enabled, `nvme-ns` device can have `ana.grpid` for the ANA group
ID.  All this information will be reported as a log information via Get
Log Page command of ANA page.

But, this is just for ANA state considered, not including any
transitions for ANA states between controllers and namespaces.
Therefore, the second patch introduced an interface to transition the ANA
state for a given ANA group via HMP command.  `nvme_ana_inject_state`
command will inject a given state to the given ANA group.

The following example breaks the path to the namespace(nsid=1) by
changing the ANA group(ana.grpid=1) state to INACCESSIBLE.  Once this
state is injected by HMP interface, Asynchronous Event Notifier(AEN)
will be reported to the host for ANA change.  Then host will read the
log page again and find a path to the namespace(nsid=1) not through the
`nvme0` controller: `nvme` controller will be taken.  Then all the I/O
heading to namespace(nsid=1) will be routed to the `nvme1` controller.

Example:

  NVMe subsystem topology

-device nvme-subsys,id=subsys0,ana=true \
-device nvme,serial=foo,id=nvme0,subsys=subsys0 \
-device nvme,serial=bar,id=nvme1,subsys=subsys0 \
-device nvme-ns,id=ns1,drive=drv10,nsid=1,subsys=subsys0,ana.grpid=1 \
-device nvme-ns,id=ns2,drive=drv11,nsid=2,subsys=subsys0,ana.grpid=2 \

  ANA state transition (HMP command)

(qemu) nvme_ana_inject_state nvme0 1 inaccessible
nvme0: ANA state inaccessible(3) injected

Thanks,

Minwoo Im (2):
  hw/block/nvme: support Asymmetric Namespace Access(ANA)
  hw/block/nvme: add 'nvme_ana_inject_state' HMP command

 hmp-commands.hx|  13 +++
 hw/block/nvme-ns.c |  26 +
 hw/block/nvme-ns.h |   3 +
 hw/block/nvme-subsys.c |   6 ++
 hw/block/nvme-subsys.h |   6 ++
 hw/block/nvme.c| 240 -
 hw/block/nvme.h|  26 +
 include/block/nvme.h   |  51 -
 include/monitor/hmp.h  |   1 +
 9 files changed, 369 insertions(+), 3 deletions(-)

-- 
2.17.1




Re: [PATCH] hw/block/nvme: drain namespaces on sq deletion

2021-02-11 Thread Minwoo Im
On 21-02-11 13:07:08, Klaus Jensen wrote:
> On Feb 11 11:49, Minwoo Im wrote:
> > On 21-01-27 14:15:05, Klaus Jensen wrote:
> > > From: Klaus Jensen 
> > > 
> > > For most commands, when issuing an AIO, the BlockAIOCB is stored in the
> > > NvmeRequest aiocb pointer when the AIO is issued. The purpose of storing
> > > this is to allow the AIO to be cancelled when deleting submission
> > > queues (it is currently not used for Abort).
> > > 
> > > Since the addition of the Dataset Management command and Zoned
> > > Namespaces, NvmeRequests may involve more than one AIO and the AIOs are
> > > issued without saving a reference to the BlockAIOCB. This is a problem
> > > since nvme_del_sq will attempt to cancel outstanding AIOs, potentially
> > > with an invalid BlockAIOCB.
> > > 
> > > Fix this by instead of explicitly cancelling the requests, just allow
> > > the AIOs to complete by draining the namespace blockdevs.
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/nvme.c | 18 +-
> > >  1 file changed, 13 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 316858fd8adf..91f6fb6da1e2 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -403,6 +403,7 @@ static void nvme_req_clear(NvmeRequest *req)
> > >  {
> > >  req->ns = NULL;
> > >  req->opaque = NULL;
> > > +req->aiocb = NULL;
> > >  memset(>cqe, 0x0, sizeof(req->cqe));
> > >  req->status = NVME_SUCCESS;
> > >  }
> > > @@ -2396,6 +2397,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, 
> > > NvmeRequest *req)
> > >  NvmeSQueue *sq;
> > >  NvmeCQueue *cq;
> > >  uint16_t qid = le16_to_cpu(c->qid);
> > > +int i;
> > >  
> > >  if (unlikely(!qid || nvme_check_sqid(n, qid))) {
> > >  trace_pci_nvme_err_invalid_del_sq(qid);
> > > @@ -2404,12 +2406,18 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, 
> > > NvmeRequest *req)
> > >  
> > >  trace_pci_nvme_del_sq(qid);
> > >  
> > > -sq = n->sq[qid];
> > > -while (!QTAILQ_EMPTY(>out_req_list)) {
> > > -r = QTAILQ_FIRST(>out_req_list);
> > > -assert(r->aiocb);
> > > -blk_aio_cancel(r->aiocb);
> > > +for (i = 1; i <= n->num_namespaces; i++) {
> > > +NvmeNamespace *ns = nvme_ns(n, i);
> > > +if (!ns) {
> > > +continue;
> > > +}
> > > +
> > > +nvme_ns_drain(ns);
> > 
> > If we just drain the entire namespaces here, commands which has nothing
> > to do with the target sq to be deleted will be drained.  And this might
> > be a burden for a single SQ deletion.
> > 
> 
> That is true. But how often would you dynamically delete and create I/O
> submission queues in the fast path?

Delete I/O queues are not that often in the working NVMe controller, but
it might be a good case for the exception test from the host side like:
I/O queue deletion during I/O workloads.  If I/O queue deletion returns
by aborting only its own requests and quickly responds to the
host, then I think it might be a good one to test with.
requests gracefully sometimes don't cause corner cases from the host
point-of-view.  But, QEMU is not only for the host testing, so I am not
sure that QEMU NVMe device should handle things gracefully or try to do
something exactly as the real hardware(but, we don't know all the
hardware behavior ;)).

(But, Right. If I'm only talking about the kernel, then kernel does not
delete queues during the fast-path hot workloads.  But it's sometimes
great to test something on their own driver or application)

> > By the way, agree with the multiple AIOs references problem for newly added
> > commands.  But, shouldn't we manage the inflight AIO request references for
> > the newlly added commands with some other way and kill them all
> > explicitly as it was?  Maybe some of list for AIOCBs?
> 
> I was hesitant to add more stuff to NvmeRequest (like a QTAILQ to track
> this). Getting a steady-state with draining was an easy fix.

Graceful handling is easy to go with.  I am not an expert on the overall
purpose of the QEMU NVMe device model, but I'm curious which one we
need to take first between `easy to go` vs. `what the device should do`.



Re: [PATCH] hw/block/nvme: fix error handling in nvme_ns_realize

2021-02-11 Thread Minwoo Im
Reviewed-by: Minwoo Im 



Re: [PATCH] hw/block/nvme: fix legacy namespace registration

2021-02-11 Thread Minwoo Im
Thanks Klaus,

Reviewed-by: Minwoo Im 



Re: [RFC PATCH 3/3] hw/block/nvme: add nvme_inject_state HMP command

2021-02-10 Thread Minwoo Im
On 21-02-11 13:24:22, Keith Busch wrote:
> On Thu, Feb 11, 2021 at 12:38:48PM +0900, Minwoo Im wrote:
> > On 21-02-11 12:00:11, Keith Busch wrote:
> > > But I would prefer to see advanced retry tied to real errors that can be
> > > retried, like if we got an EBUSY or EAGAIN errno or something like that.
> > 
> > I have seen a thread [1] about ACRE.  Forgive me If I misunderstood this
> > thread or missed something after this thread.  It looks like CRD field in
> > the CQE can be set for any NVMe error state which means it *may* depend on
> > the device status.
> 
> Right! Setting CRD values is at the controller's discretion for any
> error status as long as the host enables ACRE.
> 
> > And this patch just introduced a internal temporarily error state of
> > the controller by returning Command Intrrupted status.
> 
> It's just purely synthetic, though. I was hoping something more natural
> could trigger the status. That might not provide the deterministic
> scenario you're looking for, though.

That makes sense.  If some status can be triggered more naturally, that
would be much better.

> I'm not completely against using QEMU as a development/test vehicle for
> corner cases like this, but we are introducing a whole lot of knobs
> recently, and you practically need to be a QEMU developer to even find
> them. We probably should step up the documentation in the wiki along
> with these types of features.

Oh, that's a really good advice, really appreciate that one.

> > I think, in this stage, we can go with some errors in the middle of the
> > AIO (nvme_aio_err()) for advanced retry.  Shouldn't AIO errors are
> > retry-able and supposed to be retried ?
> 
> Sure, we can assume that receiving an error in the AIO callback means
> the lower layers exhausted available recovery mechanisms.

Okay, please let me find a way to trigger this kind of errors more
naturally.  I think this HMP command should be the last one to try if
there's nothing we can do really.



Re: [RFC PATCH 1/3] hw/block/nvme: set NVME_DNR in a single place

2021-02-10 Thread Minwoo Im
On 21-02-10 21:19:43, Klaus Jensen wrote:
> On Feb 11 04:52, Minwoo Im wrote:
> > @@ -945,6 +945,11 @@ static void nvme_post_cqes(void *opaque)
> >  static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
> >  {
> >  assert(cq->cqid == req->sq->cqid);
> > +
> > +if (req->status != NVME_SUCCESS) {
> > +req->status |= NVME_DNR;
> > +}
> 
> There are status codes where we do not set the DNR bit (e.g. Data
> Transfer Error, and that might be the only one actually).

Ouch, I think I need to prepare some of switch-helper to figure out
which one needs to be retried or not.

> Maybe a switch such that we do not explicitly set DNR for Data Transfer
> Error (and any other errors we identify), but only if we set it earlier
> in the stack.

Agreed.



Re: [RFC PATCH 3/3] hw/block/nvme: add nvme_inject_state HMP command

2021-02-10 Thread Minwoo Im
On 21-02-11 12:00:11, Keith Busch wrote:
> On Thu, Feb 11, 2021 at 04:52:52AM +0900, Minwoo Im wrote:
> > nvme_inject_state command is to give a controller state to be.
> > Human Monitor Interface(HMP) supports users to make controller to a
> > specified state of:
> > 
> > normal: Normal state (no injection)
> > cmd-interrupted:Commands will be interrupted internally
> > 
> > This patch is just a start to give dynamic command from the HMP to the
> > QEMU NVMe device model.  If "cmd-interrupted" state is given, then the
> > controller will return all the CQ entries with Command Interrupts status
> > code.
> > 
> > Usage:
> > -device nvme,id=nvme0,
> > 
> > (qemu) nvme_inject_state nvme0 cmd-interrupted
> > 
> > 
> > 
> > (qemu) nvme_inject_state nvme0 normal
> > 
> > This feature is required to test Linux kernel NVMe driver for the
> > command retry feature.
> 
> Once the user sets the injected state, all commands return that status
> until the user injects the normal state, so the CRD time is meaningless
> here. If we're really going this route, the state needs to return to
> normal on it's own.

That would also be fine to me.

> But I would prefer to see advanced retry tied to real errors that can be
> retried, like if we got an EBUSY or EAGAIN errno or something like that.

I have seen a thread [1] about ACRE.  Forgive me If I misunderstood this
thread or missed something after this thread.  It looks like CRD field in
the CQE can be set for any NVMe error state which means it *may* depend on
the device status.  And this patch just introduced an internal temporary
error state of the controller by returning Command Interrupted status.

I think, in this stage, we can go with some errors in the middle of the
AIO (nvme_aio_err()) for advanced retry.  Shouldn't AIO errors be
retryable and supposed to be retried?

> The interface you found to implement this is very interesting though.

Thanks, I just wanted to suggest a scheme to inject something to a
running NVMe device model for various testing.

[1] https://www.spinics.net/lists/dm-devel/msg42165.html



Re: [RFC PATCH 3/3] hw/block/nvme: add nvme_inject_state HMP command

2021-02-10 Thread Minwoo Im
On 21-02-10 21:33:50, Klaus Jensen wrote:
> On Feb 11 04:52, Minwoo Im wrote:
> > nvme_inject_state command is to give a controller state to be.
> > Human Monitor Interface(HMP) supports users to make controller to a
> > specified state of:
> > 
> > normal: Normal state (no injection)
> > cmd-interrupted:Commands will be interrupted internally
> > 
> > This patch is just a start to give dynamic command from the HMP to the
> > QEMU NVMe device model.  If "cmd-interrupted" state is given, then the
> > controller will return all the CQ entries with Command Interrupts status
> > code.
> > 
> > Usage:
> > -device nvme,id=nvme0,
> > 
> > (qemu) nvme_inject_state nvme0 cmd-interrupted
> > 
> > 
> > 
> > (qemu) nvme_inject_state nvme0 normal
> > 
> > This feature is required to test Linux kernel NVMe driver for the
> > command retry feature.
> > 
> 
> This is super cool and commands like this feel much nicer than the
> qom-set approach that the SMART critical warning feature took.

This interface is super easy to inject some errors to the running
device with a function call-back.

> But... looking at the existing commands I don't think we can "bloat" it
> up with a device specific command like this, but I don't know the policy
> around this.

Me neither, but I've seen the PCI AER error injection feature from
the existing commands, so I suggested this command to control the
NVMe device itself like error injection.

> If an HMP command is out, then we should be able to make do with the
> qom-set approach just fine though.

Hope so.



Re: [PATCH 2/2] hw/nvme: move device-scoped functions

2021-02-10 Thread Minwoo Im
On 21-02-09 12:08:26, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Move a bunch of functions that are internal to a device out of the
> shared header.

May I ask why?  I think some kind of these helpers can stick onto the
header.



Re: [PATCH 1/2] hw/nvme: move nvme emulation out of hw/block

2021-02-10 Thread Minwoo Im
On 21-02-09 12:08:25, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> With the introduction of the nvme-subsystem device we are really
> cluttering up the hw/block directory.
> 
> As suggested by Philippe previously, move the nvme emulation to
> hw/nvme.
> 
> Suggested-by: Philippe Mathieu-Daudé 
> Signed-off-by: Klaus Jensen 

Please add description about consolidation of nvme-ns.h and
nvme-subsys.h to the nvme.h for a unified header file when you apply
this patch! :)

Acked-by: Minwoo Im 



Re: [PATCH] hw/block/nvme: drain namespaces on sq deletion

2021-02-10 Thread Minwoo Im
On 21-01-27 14:15:05, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> For most commands, when issuing an AIO, the BlockAIOCB is stored in the
> NvmeRequest aiocb pointer when the AIO is issued. The purpose of storing
> this is to allow the AIO to be cancelled when deleting submission
> queues (it is currently not used for Abort).
> 
> Since the addition of the Dataset Management command and Zoned
> Namespaces, NvmeRequests may involve more than one AIO and the AIOs are
> issued without saving a reference to the BlockAIOCB. This is a problem
> since nvme_del_sq will attempt to cancel outstanding AIOs, potentially
> with an invalid BlockAIOCB.
> 
> Fix this by instead of explicitly cancelling the requests, just allow
> the AIOs to complete by draining the namespace blockdevs.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 18 +-
>  1 file changed, 13 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 316858fd8adf..91f6fb6da1e2 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -403,6 +403,7 @@ static void nvme_req_clear(NvmeRequest *req)
>  {
>  req->ns = NULL;
>  req->opaque = NULL;
> +req->aiocb = NULL;
>  memset(>cqe, 0x0, sizeof(req->cqe));
>  req->status = NVME_SUCCESS;
>  }
> @@ -2396,6 +2397,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest 
> *req)
>  NvmeSQueue *sq;
>  NvmeCQueue *cq;
>  uint16_t qid = le16_to_cpu(c->qid);
> +int i;
>  
>  if (unlikely(!qid || nvme_check_sqid(n, qid))) {
>  trace_pci_nvme_err_invalid_del_sq(qid);
> @@ -2404,12 +2406,18 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest 
> *req)
>  
>  trace_pci_nvme_del_sq(qid);
>  
> -sq = n->sq[qid];
> -while (!QTAILQ_EMPTY(&sq->out_req_list)) {
> -r = QTAILQ_FIRST(&sq->out_req_list);
> -assert(r->aiocb);
> -blk_aio_cancel(r->aiocb);
> +for (i = 1; i <= n->num_namespaces; i++) {
> +NvmeNamespace *ns = nvme_ns(n, i);
> +if (!ns) {
> +continue;
> +}
> +
> +nvme_ns_drain(ns);

If we just drain the entire namespaces here, commands which has nothing
to do with the target sq to be deleted will be drained.  And this might
be a burden for a single SQ deletion.

By the way, agree with the multiple AIOs references problem for newly added
commands.  But, shouldn't we manage the inflight AIO request references for
the newly added commands in some other way and kill them all
explicitly as it was?  Maybe some of list for AIOCBs?



[RFC PATCH 1/3] hw/block/nvme: set NVME_DNR in a single place

2021-02-10 Thread Minwoo Im
Set NVME_DNR in the CQ entry status field right before writing the CQ
entry: in nvme_post_cqes().  We have put NVME_DNR for all CQ entry
status for all error cases.  This patch is a former patch to support
command retry feature.

Signed-off-by: Minwoo Im 
---
 hw/block/nvme.c | 192 
 1 file changed, 97 insertions(+), 95 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 93345bf3c1fc..816e0e8e5205 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -270,12 +270,12 @@ static int nvme_aor_check(NvmeNamespace *ns, uint32_t 
act, uint32_t opn)
 if (ns->params.max_active_zones != 0 &&
 ns->nr_active_zones + act > ns->params.max_active_zones) {
 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
-return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
+return NVME_ZONE_TOO_MANY_ACTIVE;
 }
 if (ns->params.max_open_zones != 0 &&
 ns->nr_open_zones + opn > ns->params.max_open_zones) {
 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
-return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
+return NVME_ZONE_TOO_MANY_OPEN;
 }
 
 return NVME_SUCCESS;
@@ -492,7 +492,7 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 
 if (cmb || pmr) {
 if (qsg && qsg->sg) {
-return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+return NVME_INVALID_USE_OF_CMB;
 }
 
 assert(iov);
@@ -509,7 +509,7 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 }
 
 if (iov && iov->iov) {
-return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+return NVME_INVALID_USE_OF_CMB;
 }
 
 assert(qsg);
@@ -568,7 +568,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, 
uint64_t prp2,
 if (i == n->max_prp_ents - 1 && len > n->page_size) {
 if (unlikely(prp_ent & (n->page_size - 1))) {
 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
-return NVME_INVALID_PRP_OFFSET | NVME_DNR;
+return NVME_INVALID_PRP_OFFSET;
 }
 
 i = 0;
@@ -585,7 +585,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, 
uint64_t prp2,
 
 if (unlikely(prp_ent & (n->page_size - 1))) {
 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
-return NVME_INVALID_PRP_OFFSET | NVME_DNR;
+return NVME_INVALID_PRP_OFFSET;
 }
 
 trans_len = MIN(len, n->page_size);
@@ -600,7 +600,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, 
uint64_t prp2,
 } else {
 if (unlikely(prp2 & (n->page_size - 1))) {
 trace_pci_nvme_err_invalid_prp2_align(prp2);
-return NVME_INVALID_PRP_OFFSET | NVME_DNR;
+return NVME_INVALID_PRP_OFFSET;
 }
 status = nvme_map_addr(n, qsg, iov, prp2, len);
 if (status) {
@@ -637,9 +637,9 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList 
*qsg,
 break;
 case NVME_SGL_DESCR_TYPE_SEGMENT:
 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
-return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
+return NVME_INVALID_NUM_SGL_DESCRS;
 default:
-return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
+return NVME_SGL_DESCR_TYPE_INVALID;
 }
 
 dlen = le32_to_cpu(segment[i].len);
@@ -660,7 +660,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList 
*qsg,
 }
 
 trace_pci_nvme_err_invalid_sgl_excess_length(nvme_cid(req));
-return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+return NVME_DATA_SGL_LEN_INVALID;
 }
 
 trans_len = MIN(*len, dlen);
@@ -672,7 +672,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList 
*qsg,
 addr = le64_to_cpu(segment[i].addr);
 
 if (UINT64_MAX - addr < dlen) {
-return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+return NVME_DATA_SGL_LEN_INVALID;
 }
 
 status = nvme_map_addr(n, qsg, iov, addr, trans_len);
@@ -731,7 +731,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
 break;
 default:
-return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+return NVME_INVALID_SGL_SEG_DESCR;
 }
 
 seg_len = le32_to_cpu(sgld->len);
@@ -739,11 +739,11 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList 
*qsg, QEMUIOVector *iov,
 /* check the length of the (Last) Segment descriptor */
 if ((!seg_len || seg_len & 0xf) &&
 (NVME_SGL_TYPE(s

[RFC PATCH 3/3] hw/block/nvme: add nvme_inject_state HMP command

2021-02-10 Thread Minwoo Im
The nvme_inject_state command sets the controller to a specified state.
Through the Human Monitor Interface (HMP), users can put the controller
into one of the following states:

normal: Normal state (no injection)
cmd-interrupted:Commands will be interrupted internally

This patch is just a start to give dynamic command from the HMP to the
QEMU NVMe device model.  If "cmd-interrupted" state is given, then the
controller will return all the CQ entries with Command Interrupts status
code.

Usage:
-device nvme,id=nvme0,

(qemu) nvme_inject_state nvme0 cmd-interrupted



(qemu) nvme_inject_state nvme0 normal

This feature is required to test Linux kernel NVMe driver for the
command retry feature.

Signed-off-by: Minwoo Im 
---
 hmp-commands.hx   | 13 
 hw/block/nvme.c   | 49 +++
 hw/block/nvme.h   |  8 +++
 include/monitor/hmp.h |  1 +
 4 files changed, 71 insertions(+)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index d4001f9c5dc6..ef288c567b46 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1307,6 +1307,19 @@ SRST
   Inject PCIe AER error
 ERST
 
+{
+.name   = "nvme_inject_state",
+.args_type  = "id:s,state:s",
+.params = "id [normal|cmd-interrupted]",
+.help   = "inject controller/namespace state",
+.cmd= hmp_nvme_inject_state,
+},
+
+SRST
+``nvme_inject_state``
+  Inject NVMe controller/namespace state
+ERST
+
 {
 .name   = "netdev_add",
 .args_type  = "netdev:O",
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 6d3c554a0e99..42cf5bd113e6 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -123,6 +123,7 @@
 #include "sysemu/sysemu.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
+#include "qapi/qmp/qdict.h"
 #include "sysemu/hostmem.h"
 #include "sysemu/block-backend.h"
 #include "exec/memory.h"
@@ -132,6 +133,7 @@
 #include "trace.h"
 #include "nvme.h"
 #include "nvme-ns.h"
+#include "monitor/monitor.h"
 
 #define NVME_MAX_IOQPAIRS 0x
 #define NVME_DB_SIZE  4
@@ -966,6 +968,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, 
NvmeRequest *req)
 {
 assert(cq->cqid == req->sq->cqid);
 
+/*
+ * Override request status field if controller state has been injected by
+ * the QMP.
+ */
+if (cq->ctrl->state == NVME_STATE_CMD_INTERRUPTED) {
+req->status = NVME_COMMAND_INTERRUPTED;
+}
+
 if (req->status != NVME_SUCCESS) {
 if (cq->ctrl->features.acre && nvme_should_retry(req)) {
 if (cq->ctrl->params.cmd_retry_delay > 0) {
@@ -5025,4 +5035,43 @@ static void nvme_register_types(void)
 type_register_static(_bus_info);
 }
 
+static void nvme_inject_state(NvmeCtrl *n, NvmeState state)
+{
+n->state = state;
+}
+
+static const char *nvme_states[] = {
+[NVME_STATE_NORMAL] = "normal",
+[NVME_STATE_CMD_INTERRUPTED]= "cmd-interrupted",
+};
+
+void hmp_nvme_inject_state(Monitor *mon, const QDict *qdict)
+{
+const char *id = qdict_get_str(qdict, "id");
+const char *state = qdict_get_str(qdict, "state");
+PCIDevice *dev;
+NvmeCtrl *n;
+int ret, i;
+
+ret = pci_qdev_find_device(id, );
+if (ret < 0) {
+monitor_printf(mon, "invalid device id %s\n", id);
+return;
+}
+
+n = NVME(dev);
+
+for (i = 0; i < ARRAY_SIZE(nvme_states); i++) {
+if (!strcmp(nvme_states[i], state)) {
+nvme_inject_state(n, i);
+monitor_printf(mon,
+   "-device nvme,id=%s: state %s injected\n",
+   id, state);
+return;
+}
+}
+
+monitor_printf(mon, "invalid state %s\n", state);
+}
+
 type_init(nvme_register_types)
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 37940b3ac2d2..1af1e0380d9b 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -128,6 +128,11 @@ typedef struct NvmeFeatureVal {
 uint8_t acre;
 } NvmeFeatureVal;
 
+typedef enum NvmeState {
+NVME_STATE_NORMAL,
+NVME_STATE_CMD_INTERRUPTED,
+} NvmeState;
+
 typedef struct NvmeCtrl {
 PCIDeviceparent_obj;
 MemoryRegion bar0;
@@ -185,6 +190,8 @@ typedef struct NvmeCtrl {
 NvmeCQueue  admin_cq;
 NvmeIdCtrl  id_ctrl;
 NvmeFeatureVal  features;
+
+NvmeState   state;
 } NvmeCtrl;
 
 static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
@@ -212,4 +219,5 @@ static inline NvmeCtrl *nvme_ctrl(NvmeRequest *req)
 
 int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 
+void hmp_nvme_inject_state(Monitor *mon, const QDict *qdict);

[RFC PATCH 2/3] hw/block/nvme: support command retry delay

2021-02-10 Thread Minwoo Im
Set CRDT1 (Command Retry Delay Time 1) in the Identify Controller data
structure, in units of 100 milliseconds, based on the value of the newly
added 'cmd-retry-delay' parameter.  If cmd-retry-delay=1000, CRDT1 will
be set to 10.  For simplicity, this patch only considers CRDT1, without
CRDT2 and CRDT3.

This patch also introduced set/get feature command handler for Host
Behavior feature (16h).  In this feature, ACRE(Advanced Command Retry
Enable) will be set by the host based on the Identify controller data
structure, especially by CRDTs.

If 'cmd-retry-delay' is not given, the default value will be -1, in
which case CRDT will not be configured at all and ACRE will not be
supported.  In this case, we just set NVME_DNR in the error CQ entry,
just like we used to.  If it's given a positive value, then ACRE will be
supported by the device.

Signed-off-by: Minwoo Im 
---
 hw/block/nvme.c  | 65 ++--
 hw/block/nvme.h  |  2 ++
 include/block/nvme.h | 13 -
 3 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 816e0e8e5205..6d3c554a0e99 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -23,7 +23,7 @@
  *  max_ioqpairs=, \
  *  aerl=, aer_max_queued=, \
  *  mdts=,zoned.append_size_limit=, \
- *  subsys= \
+ *  subsys=,cmd-retry-delay= \
  *  -device nvme-ns,drive=,bus=,nsid=,\
  *  zoned=, \
  *  subsys=
@@ -71,6 +71,14 @@
  *   data size being in effect. By setting this property to 0, users can make
  *   ZASL to be equal to MDTS. This property only affects zoned namespaces.
  *
+ * - `cmd-retry-delay`
+ *   Command Retry Delay value in unit of millisecond.  This value will be
+ *   reported to the CRDT1(Command Retry Delay Time 1) in Identify Controller
+ *   data structure in 100 milliseconds unit.  If this is not given, DNR(Do Not
+ *   Retry) bit field in the Status field of CQ entry.  If it's given to 0,
+ *   CRD(Command Retry Delay) will be set to 0 which is for retry without
+ *   delay.  Otherwise, it will set to 1 to delay for CRDT1 value.
+ *
  * nvme namespace device parameters
  * 
  * - `subsys`
@@ -154,6 +162,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
 [NVME_WRITE_ATOMICITY]  = true,
 [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
 [NVME_TIMESTAMP]= true,
+[NVME_HOST_BEHAVIOR_SUPPORT]= true,
 };
 
 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
@@ -163,6 +172,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
 [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
+[NVME_HOST_BEHAVIOR_SUPPORT]= NVME_FEAT_CAP_CHANGE,
 };
 
 static const uint32_t nvme_cse_acs[256] = {
@@ -904,6 +914,16 @@ static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, 
uint32_t len,
 return status;
 }
 
+static inline bool nvme_should_retry(NvmeRequest *req)
+{
+switch (req->status) {
+case NVME_COMMAND_INTERRUPTED:
+return true;
+default:
+return false;
+}
+}
+
 static void nvme_post_cqes(void *opaque)
 {
 NvmeCQueue *cq = opaque;
@@ -947,7 +967,13 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, 
NvmeRequest *req)
 assert(cq->cqid == req->sq->cqid);
 
 if (req->status != NVME_SUCCESS) {
-req->status |= NVME_DNR;
+if (cq->ctrl->features.acre && nvme_should_retry(req)) {
+if (cq->ctrl->params.cmd_retry_delay > 0) {
+req->status |= NVME_CRD_CRDT1;
+}
+} else {
+req->status |= NVME_DNR;
+}
 }
 
 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
@@ -3401,6 +3427,16 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, 
NvmeRequest *req)
 DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static uint16_t nvme_get_feature_host_behavior(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeFeatureHostBehavior data = {};
+
+data.acre = n->features.acre;
+
+return nvme_dma(n, (uint8_t *), sizeof(data),
+DMA_DIRECTION_FROM_DEVICE, req);
+}
+
 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeCmd *cmd = >cmd;
@@ -3506,6 +3542,8 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest 
*req)
 goto out;
 case NVME_TIMESTAMP:
 return nvme_get_feature_timestamp(n, req);
+case NVME_HOST_BEHAVIOR_SUPPORT:
+return nvme_get_feature_host_behavior(n, req);
 default:
 break;
 }
@@ -3569,6 +3607,22 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_SUCCESS;
 }
 
+static uint16_t nvme_set_feature_host_behavior(NvmeCtrl *n, NvmeRequest *req)
+{
+  

[RFC PATCH 0/3] support command retry

2021-02-10 Thread Minwoo Im
Hello,

This series is RFC about supporting command retry feature in NVMe device
model.  The background to propose this feature is that in kernel
development and testing, the retry scheme has not been able to be
covered by the QEMU NVMe device model.  If we are able to control the
retry scheme from the device side, it would be nice for kernel
developers to test.

We have been putting NVME_DNR in the CQ entry status field for
all error cases.  This series added a control for the command retry
based on the 'cmd-retry-delay' parameter which is newly added.  If it's
given to positive value, Command Retry Delay Time1(CRDT1) in the
Identify Controller data structure will be set in 100msec units.
Accordingly, it will cause host to Set Feature with Host Behavior(0x16)
to enable the Advanced Command Retry Enable(ACRE) feature to support
command retry with defined delay.  If 'cmd-retry-delay' param is given
to 0, then command failures will be retried directly without delay.

This series just considered Command Interrupted status code first which
is mainly about the ACRE feature addition.  nvme_should_retry() helper
will decide command should be retried or not.

But, we don't have any use-cases specified for the Command Interrupted
status code in the device model.  So, I proposed [3/3] patch by adding
'nvme_inject_state' HMP command to make users to give pre-defined state
to the controller device by injecting it via QEMU monitor.

Usage:

  # Configure the nvme0 device to be retried every 1sec(1000msec)
  -device nvme,id=nvme0,cmd-retry-delay=1000,...

  (qemu) nvme_inject_state nvme0 cmd-interrupted
  -device nvme,id=nvme0: state cmd-interrupted injected
  (qemu)

  # Then from now on, controller will interrupt all the commands
  # to be processed with Command Interrupted status code.  Then host
  # will retry based on the delay.

Thanks,

Minwoo Im (3):
  hw/block/nvme: set NVME_DNR in a single place
  hw/block/nvme: support command retry delay
  hw/block/nvme: add nvme_inject_state HMP command

 hmp-commands.hx   |  13 ++
 hw/block/nvme.c   | 304 +-
 hw/block/nvme.h   |  10 ++
 include/block/nvme.h  |  13 +-
 include/monitor/hmp.h |   1 +
 5 files changed, 244 insertions(+), 97 deletions(-)

-- 
2.17.1




[PATCH V2 6/7] hw/block/nvme: support namespace attachment command

2021-02-10 Thread Minwoo Im
This patch supports the Namespace Attachment command for the pre-defined
nvme-ns device nodes.  Of course, attaching/detaching a namespace should
only be supported when 'subsys' is given.  This is because if we detach
a namespace from a controller, somebody needs to manage the detached,
but allocated, namespace in the NVMe subsystem.

Signed-off-by: Minwoo Im 
---
 hw/block/nvme-subsys.h | 10 +++
 hw/block/nvme.c| 59 ++
 hw/block/nvme.h|  5 
 hw/block/trace-events  |  2 ++
 include/block/nvme.h   |  5 
 5 files changed, 81 insertions(+)

diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index 14627f9ccb41..ef4bec928eae 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -30,6 +30,16 @@ typedef struct NvmeSubsystem {
 int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp);
 int nvme_subsys_register_ns(NvmeNamespace *ns, Error **errp);
 
+static inline NvmeCtrl *nvme_subsys_ctrl(NvmeSubsystem *subsys,
+uint32_t cntlid)
+{
+if (!subsys) {
+return NULL;
+}
+
+return subsys->ctrls[cntlid];
+}
+
 /*
  * Return allocated namespace of the specified nsid in the subsystem.
  */
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 697368a6ae0c..71bcd66f1956 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -183,6 +183,7 @@ static const uint32_t nvme_cse_acs[256] = {
 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_NS_ATTACHMENT]= NVME_CMD_EFF_CSUPP,
 };
 
 static const uint32_t nvme_cse_iocs_none[256];
@@ -3766,6 +3767,62 @@ static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
+static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns);
+static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeNamespace *ns;
+NvmeCtrl *ctrl;
+uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
+uint32_t nsid = le32_to_cpu(req->cmd.nsid);
+uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
+bool attach = !(dw10 & 0xf);
+uint16_t *nr_ids = [0];
+uint16_t *ids = [1];
+uint16_t ret;
+int i;
+
+trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
+
+ns = nvme_subsys_ns(n->subsys, nsid);
+if (!ns) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+ret = nvme_dma(n, (uint8_t *)list, 4096,
+   DMA_DIRECTION_TO_DEVICE, req);
+if (ret) {
+return ret;
+}
+
+if (!*nr_ids) {
+return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
+}
+
+for (i = 0; i < *nr_ids; i++) {
+ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
+if (!ctrl) {
+return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
+}
+
+if (attach) {
+if (nvme_ns_is_attached(ctrl, ns)) {
+return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
+}
+
+nvme_ns_attach(ctrl, ns);
+__nvme_select_ns_iocs(ctrl, ns);
+} else {
+if (!nvme_ns_is_attached(ctrl, ns)) {
+return NVME_NS_NOT_ATTACHED | NVME_DNR;
+}
+
+nvme_ns_detach(ctrl, ns);
+}
+}
+
+return NVME_SUCCESS;
+}
+
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
@@ -3797,6 +3854,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_get_feature(n, req);
 case NVME_ADM_CMD_ASYNC_EV_REQ:
 return nvme_aer(n, req);
+case NVME_ADM_CMD_NS_ATTACHMENT:
+return nvme_ns_attachment(n, req);
 default:
 assert(false);
 }
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 1c7796b20996..5a1ab857d166 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -222,6 +222,11 @@ static inline void nvme_ns_attach(NvmeCtrl *n, 
NvmeNamespace *ns)
 n->namespaces[nvme_nsid(ns) - 1] = ns;
 }
 
+static inline void nvme_ns_detach(NvmeCtrl *n, NvmeNamespace *ns)
+{
+n->namespaces[nvme_nsid(ns) - 1] = NULL;
+}
+
 static inline NvmeCQueue *nvme_cq(NvmeRequest *req)
 {
 NvmeSQueue *sq = req->sq;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index b6e972d733a6..bf67fe7873d2 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -80,6 +80,8 @@ pci_nvme_aer(uint16_t cid) "cid %"PRIu16""
 pci_nvme_aer_aerl_exceeded(void) "aerl exceeded"
 pci_nvme_aer_masked(uint8_t type, uint8_t mask) "type 0x%"PRIx8" mask 
0x%"PRIx8""
 pci_nvme_aer_post_cqe(uint8_t typ, uint8_t info, uint8_t log_page) "type 
0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8""
+pci_nvme_ns_attachment(uint16_t cid, uint8_t sel) "cid %"PRIu16", 
sel=0x%"PRIx8""
+pci_nvme_ns_attachm

[PATCH V2 3/7] hw/block/nvme: fix allocated namespace list to 256

2021-02-10 Thread Minwoo Im
Expand the allocated namespace list (subsys->namespaces) to have 256
entries, which is a value larger than NVME_MAX_NAMESPACES, the size of
the attached namespace list in a controller.

The allocated namespace list should be at least as large as the attached
namespace list.

n->num_namespaces = NVME_MAX_NAMESPACES;

The above line will set the NN field by id->nn so that the subsystem
should also prepare at least this number of namespace list entries.

Signed-off-by: Minwoo Im 
---
 hw/block/nvme-subsys.h | 2 +-
 hw/block/nvme.h| 6 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index 574774390c4c..8a0732b22316 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -14,7 +14,7 @@
 OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
 
 #define NVME_SUBSYS_MAX_CTRLS   32
-#define NVME_SUBSYS_MAX_NAMESPACES  32
+#define NVME_SUBSYS_MAX_NAMESPACES  256
 
 typedef struct NvmeCtrl NvmeCtrl;
 typedef struct NvmeNamespace NvmeNamespace;
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index bde0ed7c2679..1c7796b20996 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -10,6 +10,12 @@
 #define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
 
+/*
+ * Subsystem namespace list for allocated namespaces should be larger than
+ * attached namespace list in a controller.
+ */
+QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_SUBSYS_MAX_NAMESPACES);
+
 typedef struct NvmeParams {
 char *serial;
 uint32_t num_queues; /* deprecated since 5.1 */
-- 
2.17.1




[PATCH V2 5/7] hw/block/nvme: refactor nvme_select_ns_iocs

2021-02-10 Thread Minwoo Im
This patch has no functional changes.  It just refactors
nvme_select_ns_iocs() to iterate over the attached namespaces of the
controller and invoke __nvme_select_ns_iocs().

Signed-off-by: Minwoo Im 
---
 hw/block/nvme.c | 36 +---
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d1761a82731f..697368a6ae0c 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3896,6 +3896,25 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n)
 }
 }
 
+static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns)
+{
+ns->iocs = nvme_cse_iocs_none;
+switch (ns->csi) {
+case NVME_CSI_NVM:
+if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
+ns->iocs = nvme_cse_iocs_nvm;
+}
+break;
+case NVME_CSI_ZONED:
+if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
+ns->iocs = nvme_cse_iocs_zoned;
+} else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
+ns->iocs = nvme_cse_iocs_nvm;
+}
+break;
+}
+}
+
 static void nvme_select_ns_iocs(NvmeCtrl *n)
 {
 NvmeNamespace *ns;
@@ -3906,21 +3925,8 @@ static void nvme_select_ns_iocs(NvmeCtrl *n)
 if (!ns) {
 continue;
 }
-ns->iocs = nvme_cse_iocs_none;
-switch (ns->csi) {
-case NVME_CSI_NVM:
-if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
-ns->iocs = nvme_cse_iocs_nvm;
-}
-break;
-case NVME_CSI_ZONED:
-if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
-ns->iocs = nvme_cse_iocs_zoned;
-} else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
-ns->iocs = nvme_cse_iocs_nvm;
-}
-break;
-}
+
+__nvme_select_ns_iocs(n, ns);
 }
 }
 
-- 
2.17.1




[PATCH V2 2/7] hw/block/nvme: fix namespaces array to 1-based

2021-02-10 Thread Minwoo Im
subsys->namespaces array used to be sized to NVME_SUBSYS_MAX_NAMESPACES.
But subsys->namespaces are being accessed with 1-based namespace id
which means the very first array entry will always be empty(NULL).

Signed-off-by: Minwoo Im 
---
 hw/block/nvme-subsys.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index 890d118117dc..574774390c4c 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -24,7 +24,7 @@ typedef struct NvmeSubsystem {
 
 NvmeCtrl*ctrls[NVME_SUBSYS_MAX_CTRLS];
 /* Allocated namespaces for this subsystem */
-NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES];
+NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES + 1];
 } NvmeSubsystem;
 
 int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp);
-- 
2.17.1




[PATCH V2 7/7] hw/block/nvme: support Identify NS Attached Controller List

2021-02-10 Thread Minwoo Im
Support Identify command for Namespace attached controller list.  This
command handler will traverse the controller instances in the given
subsystem to figure out whether the specified nsid is attached to the
controllers or not.

The 4096bytes Identify data will return with the first entry (16bits)
indicating the number of the controller id entries.  So, the data can
hold up to 2047 entries for the controller ids.

Signed-off-by: Minwoo Im 
---
 hw/block/nvme.c   | 42 ++
 hw/block/trace-events |  1 +
 include/block/nvme.h  |  1 +
 3 files changed, 44 insertions(+)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 71bcd66f1956..da60335def9f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3157,6 +3157,46 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req, bool active)
 return NVME_INVALID_CMD_SET | NVME_DNR;
 }
 
+static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeIdentify *c = (NvmeIdentify *)>cmd;
+uint16_t min_id = le16_to_cpu(c->ctrlid);
+uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
+uint16_t *ids = [1];
+NvmeNamespace *ns;
+NvmeCtrl *ctrl;
+int cntlid, nr_ids = 0;
+
+trace_pci_nvme_identify_ns_attached_list(min_id);
+
+if (c->nsid == NVME_NSID_BROADCAST) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+ns = nvme_subsys_ns(n->subsys, c->nsid);
+if (!ns) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
+ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
+if (!ctrl) {
+continue;
+}
+
+if (!nvme_ns_is_attached(ctrl, ns)) {
+continue;
+}
+
+ids[nr_ids++] = cntlid;
+}
+
+list[0] = nr_ids;
+
+return nvme_dma(n, (uint8_t *)list, sizeof(list),
+DMA_DIRECTION_FROM_DEVICE, req);
+}
+
 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
 bool active)
 {
@@ -3356,6 +3396,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_identify_ns(n, req, true);
 case NVME_ID_CNS_NS_PRESENT:
 return nvme_identify_ns(n, req, false);
+case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
+return nvme_identify_ns_attached_list(n, req);
 case NVME_ID_CNS_CS_NS:
 return nvme_identify_ns_csi(n, req, true);
 case NVME_ID_CNS_CS_NS_PRESENT:
diff --git a/hw/block/trace-events b/hw/block/trace-events
index bf67fe7873d2..2d88d96c2165 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -62,6 +62,7 @@ pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, 
cqid=%"PRIu16""
 pci_nvme_identify_ctrl(void) "identify controller"
 pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8""
 pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32""
+pci_nvme_identify_ns_attached_list(uint16_t cntid) "cntid=%"PRIu16""
 pci_nvme_identify_ns_csi(uint32_t ns, uint8_t csi) "nsid=%"PRIu32", 
csi=0x%"PRIx8""
 pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32""
 pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", 
csi=0x%"PRIx8""
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 4b016f954fee..fb82d8682e9f 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -968,6 +968,7 @@ enum NvmeIdCns {
 NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
 NVME_ID_CNS_NS_PRESENT_LIST   = 0x10,
 NVME_ID_CNS_NS_PRESENT= 0x11,
+NVME_ID_CNS_NS_ATTACHED_CTRL_LIST = 0x12,
 NVME_ID_CNS_CS_NS_PRESENT_LIST= 0x1a,
 NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
 NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
-- 
2.17.1




[PATCH V2 4/7] hw/block/nvme: support allocated namespace type

2021-02-10 Thread Minwoo Im
NVMe spec 1.4b, "6.1.5. NSID and Namespace Relationships", defines the
following valid namespace types:

- Unallocated: Not exists in the NVMe subsystem
- Allocated: Exists in the NVMe subsystem
- Inactive: Not attached to the controller
- Active: Attached to the controller

This patch added support for allocated, but not attached namespace type:

!nvme_ns(n, nsid) && nvme_subsys_ns(n->subsys, nsid)

nvme_ns() returns attached namespace instance of the given controller
and nvme_subsys_ns() returns allocated namespace instance in the
subsystem.

Signed-off-by: Minwoo Im 
---
 hw/block/nvme-subsys.h | 13 +
 hw/block/nvme.c| 63 +++---
 2 files changed, 60 insertions(+), 16 deletions(-)

diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index 8a0732b22316..14627f9ccb41 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -30,4 +30,17 @@ typedef struct NvmeSubsystem {
 int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp);
 int nvme_subsys_register_ns(NvmeNamespace *ns, Error **errp);
 
+/*
+ * Return allocated namespace of the specified nsid in the subsystem.
+ */
+static inline NvmeNamespace *nvme_subsys_ns(NvmeSubsystem *subsys,
+uint32_t nsid)
+{
+if (!subsys) {
+return NULL;
+}
+
+return subsys->namespaces[nsid];
+}
+
 #endif /* NVME_SUBSYS_H */
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a1e930f7c8e4..d1761a82731f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3124,7 +3124,7 @@ static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
@@ -3138,7 +3138,14 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req)
 
 ns = nvme_ns(n, nsid);
 if (unlikely(!ns)) {
-return nvme_rpt_empty_id_struct(n, req);
+if (!active) {
+ns = nvme_subsys_ns(n->subsys, nsid);
+if (!ns) {
+return nvme_rpt_empty_id_struct(n, req);
+}
+} else {
+return nvme_rpt_empty_id_struct(n, req);
+}
 }
 
 if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
@@ -3149,7 +3156,8 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest 
*req)
 return NVME_INVALID_CMD_SET | NVME_DNR;
 }
 
-static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
+bool active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
@@ -3163,7 +3171,14 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, 
NvmeRequest *req)
 
 ns = nvme_ns(n, nsid);
 if (unlikely(!ns)) {
-return nvme_rpt_empty_id_struct(n, req);
+if (!active) {
+ns = nvme_subsys_ns(n->subsys, nsid);
+if (!ns) {
+return nvme_rpt_empty_id_struct(n, req);
+}
+} else {
+return nvme_rpt_empty_id_struct(n, req);
+}
 }
 
 if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
@@ -3176,7 +3191,8 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
+bool active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
@@ -3201,7 +3217,14 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 for (i = 1; i <= n->num_namespaces; i++) {
 ns = nvme_ns(n, i);
 if (!ns) {
-continue;
+if (!active) {
+ns = nvme_subsys_ns(n->subsys, i);
+if (!ns) {
+continue;
+}
+} else {
+continue;
+}
 }
 if (ns->params.nsid <= min_nsid) {
 continue;
@@ -3215,7 +3238,8 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
 }
 
-static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
+bool active)
 {
 NvmeNamespace *ns;
 NvmeIdentify *c = (NvmeIdentify *)>cmd;
@@ -3241,7 +3265,14 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, 
NvmeRequest *req)
 for (i = 1; i <= n->num_namespaces; i++) {
 ns = nvme_ns(n, i);
 if (!ns) {
-continue;
+if (!active) {
+ns =

[PATCH V2 1/7] hw/block/nvme: support namespace detach

2021-02-10 Thread Minwoo Im
Given that the nvme-subsys device is now supported, we can manage
namespaces that are allocated but not attached: detached.  This patch
introduces a parameter for the nvme-ns device named 'detached'.  This
parameter indicates whether the given namespace device is detached from
an entire NVMe subsystem ('subsys' given case, shared namespace) or a
controller ('bus' given case, private namespace).

- Allocated namespace

  1) Shared ns in the subsystem 'subsys0':

 -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,subsys=subsys0,detached=true

  2) Private ns for the controller 'nvme0' of the subsystem 'subsys0':

 -device nvme-subsys,id=subsys0
 -device nvme,serial=foo,id=nvme0,subsys=subsys0
 -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,bus=nvme0,detached=true

  3) (Invalid case) Controller 'nvme0' has no subsystem to manage ns:

 -device nvme,serial=foo,id=nvme0
 -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,bus=nvme0,detached=true

Signed-off-by: Minwoo Im 
---
 hw/block/nvme-ns.c |  1 +
 hw/block/nvme-ns.h |  1 +
 hw/block/nvme-subsys.h |  1 +
 hw/block/nvme.c| 41 +++--
 hw/block/nvme.h| 22 ++
 5 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index c3b513b0fc78..cdcb81319fb5 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -393,6 +393,7 @@ static Property nvme_ns_props[] = {
 DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
 DEFINE_PROP_LINK("subsys", NvmeNamespace, subsys, TYPE_NVME_SUBSYS,
  NvmeSubsystem *),
+DEFINE_PROP_BOOL("detached", NvmeNamespace, params.detached, false),
 DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
 DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
 DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128),
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 7af6884862b5..b0c00e115d81 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -26,6 +26,7 @@ typedef struct NvmeZone {
 } NvmeZone;
 
 typedef struct NvmeNamespaceParams {
+bool detached;
 uint32_t nsid;
 QemuUUID uuid;
 
diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h
index ccf6a71398d3..890d118117dc 100644
--- a/hw/block/nvme-subsys.h
+++ b/hw/block/nvme-subsys.h
@@ -23,6 +23,7 @@ typedef struct NvmeSubsystem {
 uint8_t subnqn[256];
 
 NvmeCtrl*ctrls[NVME_SUBSYS_MAX_CTRLS];
+/* Allocated namespaces for this subsystem */
 NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES];
 } NvmeSubsystem;
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 6b84e34843f5..a1e930f7c8e4 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -23,7 +23,7 @@
  *  max_ioqpairs=, \
  *  aerl=, aer_max_queued=, \
  *  mdts=,zoned.append_size_limit=, \
- *  subsys= \
+ *  subsys=,detached=
  *  -device nvme-ns,drive=,bus=,nsid=,\
  *  zoned=, \
  *  subsys=
@@ -78,6 +78,13 @@
  *   controllers in the subsystem. Otherwise, `bus` must be given to attach
  *   this namespace to a specified single controller as a non-shared namespace.
  *
+ * - `detached`
+ *   Not to attach the namespace device to controllers in the NVMe subsystem
+ *   during boot-up. If not given, namespaces are all attahced to all
+ *   controllers in the subsystem by default.
+ *   It's mutual exclusive with 'bus' parameter. It's only valid in case
+ *   `subsys` is provided.
+ *
  * Setting `zoned` to true selects Zoned Command Set at the namespace.
  * In this case, the following namespace properties are available to configure
  * zoned operation:
@@ -4521,6 +4528,20 @@ static void nvme_init_state(NvmeCtrl *n)
 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
 }
 
+static int nvme_attach_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+{
+if (nvme_ns_is_attached(n, ns)) {
+error_setg(errp,
+   "namespace %d is already attached to controller %d",
+   nvme_nsid(ns), n->cntlid);
+return -1;
+}
+
+nvme_ns_attach(n, ns);
+
+return 0;
+}
+
 int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
 {
 uint32_t nsid = nvme_nsid(ns);
@@ -4552,7 +4573,23 @@ int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace 
*ns, Error **errp)
 
 trace_pci_nvme_register_namespace(nsid);
 
-n->namespaces[nsid - 1] = ns;
+/*
+ * If subsys is not given, namespace is always attached to the controller
+ * because there's no subsystem to manage namespace allocation.
+ */
+if (!n->subsys) {
+if (ns->params.detached) {
+error_setg(errp,
+   "detached needs nvme-subsys specified nvme or nvme-ns");
+return -1;
+}
+
+return nvme_attach_namesp

[PATCH V2 0/6] hw/block/nvme: support namespace attachment

2021-02-10 Thread Minwoo Im
  FW Rev  
  -  
 - -- 
 
  /dev/nvme0n1  foo  QEMU NVMe Ctrl 
  1 268.44  MB / 268.44  MB512   B +  0 B   1.0 
  /dev/nvme0n2  foo  QEMU NVMe Ctrl 
  2 268.44  MB / 268.44  MB512   B +  0 B   1.0 
  /dev/nvme1n2  bar  QEMU NVMe Ctrl 
  2 268.44  MB / 268.44  MB512   B +  0 B   1.0 
  /dev/nvme1n3  bar  QEMU NVMe Ctrl 
  3 268.44  MB / 268.44  MB512   B +  0 B   1.0 
  root@vm:~/work# nvme attach-ns /dev/nvme0 --namespace-id=1 --controllers=1
  attach-ns: Success, nsid:1
  root@vm:~/work# echo 1 > /sys/class/nvme/nvme1/rescan_controller 
  root@vm:~/work# nvme list
  Node  SN   Model  
  Namespace Usage  Format   FW Rev  
  -  
 - -- 
 
  /dev/nvme0n1  foo  QEMU NVMe Ctrl 
  1 268.44  MB / 268.44  MB512   B +  0 B   1.0 
  /dev/nvme0n2  foo  QEMU NVMe Ctrl 
  2 268.44  MB / 268.44  MB512   B +  0 B   1.0 
  /dev/nvme1n1  bar  QEMU NVMe Ctrl 
  1 268.44  MB / 268.44  MB512   B +  0 B   1.0 
  /dev/nvme1n2  bar  QEMU NVMe Ctrl 
  2 268.44  MB / 268.44  MB512   B +  0 B   1.0 
  /dev/nvme1n3  bar  QEMU NVMe Ctrl 
  3 268.44  MB / 268.44  MB512   B +  0 B   1.0 

Minwoo Im (7):
  hw/block/nvme: support namespace detach
  hw/block/nvme: fix namespaces array to 1-based
  hw/block/nvme: fix allocated namespace list to 256
  hw/block/nvme: support allocated namespace type
  hw/block/nvme: refactor nvme_select_ns_iocs
  hw/block/nvme: support namespace attachment command
  hw/block/nvme: support Identify NS Attached Controller List

 hw/block/nvme-ns.c |   1 +
 hw/block/nvme-ns.h |   1 +
 hw/block/nvme-subsys.h |  28 -
 hw/block/nvme.c| 241 +++--
 hw/block/nvme.h|  33 ++
 hw/block/trace-events  |   3 +
 include/block/nvme.h   |   6 +
 7 files changed, 278 insertions(+), 35 deletions(-)

-- 
2.17.1




Re: [PATCH 2/2] hw/block/nvme: add write uncorrectable command

2021-02-10 Thread Minwoo Im
> > It might be nitpick, 'nlb' would easily represent the value which is
> > defined itself in the spec which is zero-based.  Can we have this like:
> > 
> > uint32_t nlb = le16_to_cpu(rw->nlb);
> > 
> > bitmap_clear(ns->uncorrectable, slba, nlb + 1);
> > 
> 
> 
> I do not disagree, but the `uint32_t nlb = le16_to_cpu(rw->nlb) + 1;`
> pattern is already used in several places.

Oh yes, now I see some places.  Then, please take my review tag for
this patch.

Thanks!



Re: [PATCH 2/2] hw/block/nvme: add write uncorrectable command

2021-02-10 Thread Minwoo Im
On 21-02-10 08:06:46, Klaus Jensen wrote:
> From: Gollu Appalanaidu 
> 
> Add support for marking blocks invalid with the Write Uncorrectable
> command. Block status is tracked in a (non-persistent) bitmap that is
> checked on all reads and written to on all writes. This is potentially
> expensive, so keep Write Uncorrectable disabled by default.
> 
> Signed-off-by: Gollu Appalanaidu 
> Signed-off-by: Klaus Jensen 
> ---
>  docs/specs/nvme.txt   |  3 ++
>  hw/block/nvme-ns.h|  2 ++
>  hw/block/nvme.h   |  1 +
>  hw/block/nvme-ns.c|  2 ++
>  hw/block/nvme.c   | 65 +--
>  hw/block/trace-events |  1 +
>  6 files changed, 66 insertions(+), 8 deletions(-)
> 
> diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
> index 56d393884e7a..88f9cc278d4c 100644
> --- a/docs/specs/nvme.txt
> +++ b/docs/specs/nvme.txt
> @@ -19,5 +19,8 @@ Known issues
>  
>  * The accounting numbers in the SMART/Health are reset across power cycles
>  
> +* Marking blocks invalid with the Write Uncorrectable is not persisted across
> +  power cycles.
> +
>  * Interrupt Coalescing is not supported and is disabled by default in 
> violation
>of the specification.
> diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> index 7af6884862b5..15fa422ded03 100644
> --- a/hw/block/nvme-ns.h
> +++ b/hw/block/nvme-ns.h
> @@ -72,6 +72,8 @@ typedef struct NvmeNamespace {
>  struct {
>  uint32_t err_rec;
>  } features;
> +
> +unsigned long *uncorrectable;
>  } NvmeNamespace;
>  
>  static inline uint32_t nvme_nsid(NvmeNamespace *ns)
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 98082b2dfba3..9b8f85b9cf16 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -68,6 +68,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
>  case NVME_CMD_FLUSH:return "NVME_NVM_CMD_FLUSH";
>  case NVME_CMD_WRITE:return "NVME_NVM_CMD_WRITE";
>  case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
> +case NVME_CMD_WRITE_UNCOR:  return "NVME_CMD_WRITE_UNCOR";
>  case NVME_CMD_COMPARE:  return "NVME_NVM_CMD_COMPARE";
>  case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
>  case NVME_CMD_DSM:  return "NVME_NVM_CMD_DSM";
> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> index ade46e2f3739..742bbc4b4b62 100644
> --- a/hw/block/nvme-ns.c
> +++ b/hw/block/nvme-ns.c
> @@ -72,6 +72,8 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
>  id_ns->mcl = cpu_to_le32(ns->params.mcl);
>  id_ns->msrc = ns->params.msrc;
>  
> +ns->uncorrectable = bitmap_new(id_ns->nsze);
> +
>  return 0;
>  }
>  
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index e5f725d7..56048046c193 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1112,6 +1112,20 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, 
> uint64_t slba,
>  return NVME_SUCCESS;
>  }
>  
> +static inline uint16_t nvme_check_uncor(NvmeNamespace *ns, uint64_t slba,
> +uint32_t nlb)
> +{
> +uint64_t elba = nlb + slba;
> +
> +if (ns->uncorrectable) {
> +if (find_next_bit(ns->uncorrectable, elba, slba) < elba) {
> +return NVME_UNRECOVERED_READ | NVME_DNR;
> +}
> +}
> +
> +return NVME_SUCCESS;
> +}
> +
>  static void nvme_aio_err(NvmeRequest *req, int ret)
>  {
>  uint16_t status = NVME_SUCCESS;
> @@ -1423,14 +1437,24 @@ static void nvme_rw_cb(void *opaque, int ret)
>  BlockAcctCookie *acct = >acct;
>  BlockAcctStats *stats = blk_get_stats(blk);
>  
> +bool is_write = nvme_is_write(req);
> +
>  trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
>  
> -if (ns->params.zoned && nvme_is_write(req)) {
> +if (ns->params.zoned && is_write) {
>  nvme_finalize_zoned_write(ns, req);
>  }
>  
>  if (!ret) {
>  block_acct_done(stats, acct);
> +
> +if (is_write) {
> +NvmeRwCmd *rw = (NvmeRwCmd *)>cmd;
> +    uint64_t slba = le64_to_cpu(rw->slba);
> +uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
> +
> +bitmap_clear(ns->uncorrectable, slba, nlb);

It might be a nitpick, but 'nlb' would more readily represent the value
as defined in the spec, which is zero-based.  Can we have this like:

uint32_t nlb = le16_to_cpu(rw->nlb);

bitmap_clear(ns->uncorrectable, slba, nlb + 1);

Otherwise, it looks good to me.

Reviewed-by: Minwoo Im 



Re: [PATCH 1/2] hw/block/nvme: add oncs device parameter

2021-02-10 Thread Minwoo Im
On 21-02-10 08:06:45, Klaus Jensen wrote:
> From: Gollu Appalanaidu 
> 
> Add the 'oncs' nvme device parameter to allow optional features to be
> enabled/disabled explicitly. Since most of these are optional commands,
> make the CSE log pages dynamic to account for the value of ONCS.
> 
> Signed-off-by: Gollu Appalanaidu 
> Signed-off-by: Klaus Jensen 

Reviewed-by: Minwoo Im 



  1   2   3   >