Re: [PATCH v5 21/26] nvme: add support for scatter gather lists
On Mon, 2020-03-16 at 00:54 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 14:07, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:52 +0100, Klaus Jensen wrote:
> > > For now, support the Data Block, Segment and Last Segment descriptor
> > > types.
> > >
> > > See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
> > >
[...]
> > > @@ -75,8 +75,10 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
> > >
> > >  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
> > >  {
> > > -    if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
> > > -        memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
> > > +    hwaddr hi = addr + size;
> >
> > Are you sure you don't want to check for overflow here?
> > It's a theoretical issue, since addr has to be almost the full 64 bits,
> > but still, for things like this I check very defensively.
>
> The use of nvme_addr_read in map_prp simply cannot overflow due to how
> the size is calculated, but for SGLs it's different. The overflow is
> checked in map_sgl, because we have to return a special error code in
> that case.
>
> On the other hand, there may be other callers of nvme_addr_read in the
> future that do not check this, so I'll re-add it.

Yep. For security things, as many checks as possible is always good.
Thanks!

[...]
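For reference, a minimal sketch of what the re-added guard could look like (the shape is assumed, not taken from a posted patch; the return value chosen for the wraparound case is illustrative):

    static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
    {
        hwaddr hi = addr + size;

        /* addr + size wrapped around the 64-bit address space */
        if (hi < addr) {
            return 1;
        }

        if (n->cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
            memcpy(buf, nvme_addr_to_cmb(n, addr), size);
            return 0;
        }

        return pci_dma_read(&n->parent_obj, addr, buf, size);
    }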
Re: [PATCH v5 21/26] nvme: add support for scatter gather lists
On Feb 12 14:07, Maxim Levitsky wrote:
> On Tue, 2020-02-04 at 10:52 +0100, Klaus Jensen wrote:
> > For now, support the Data Block, Segment and Last Segment descriptor
> > types.
> >
> > See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
> >
[...]
> > @@ -75,8 +75,10 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
> >
> >  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
> >  {
> > -    if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
> > -        memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
> > +    hwaddr hi = addr + size;
>
> Are you sure you don't want to check for overflow here?
> It's a theoretical issue, since addr has to be almost the full 64 bits,
> but still, for things like this I check very defensively.

The use of nvme_addr_read in map_prp simply cannot overflow due to how
the size is calculated, but for SGLs it's different. The overflow is
checked in map_sgl, because we have to return a special error code in
that case.

On the other hand, there may be other callers of nvme_addr_read in the
future that do not check this, so I'll re-add it.

> > +
> > +    if (n->cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
>
> Here you fix the bug I mentioned in patch 6. I suggest you move the fix
> there.

Done.
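To make the map_sgl point concrete: the SGL path has to detect the wraparound itself so it can fail the command with a proper NVMe status code instead of silently mapping the wrong range. A sketch, assuming a data block descriptor (the status constant is illustrative, not necessarily the one the series uses):

    /*
     * Sketch: reject a data block descriptor whose address plus length
     * would wrap the 64-bit address space.
     */
    uint64_t addr = le64_to_cpu(segment[i].addr);
    uint32_t blen = le32_to_cpu(segment[i].len);

    if (UINT64_MAX - addr < blen) {
        return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
    }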
Re: [PATCH v5 21/26] nvme: add support for scatter gather lists
On Tue, 2020-02-04 at 10:52 +0100, Klaus Jensen wrote:
> For now, support the Data Block, Segment and Last Segment descriptor
> types.
>
> See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
>
> Signed-off-by: Klaus Jensen
> Acked-by: Fam Zheng
> ---
>  block/nvme.c          |  18 +-
>  hw/block/nvme.c       | 375 +++---
>  hw/block/trace-events |   4 +
>  include/block/nvme.h  |  62 ++-
>  4 files changed, 389 insertions(+), 70 deletions(-)
>
[...]
> @@ -75,8 +75,10 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
>
>  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
> -    if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
> -        memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
> +    hwaddr hi = addr + size;

Are you sure you don't want to check for overflow here?
It's a theoretical issue, since addr has to be almost the full 64 bits,
but still, for things like this I check very defensively.

> +
> +    if (n->cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {

Here you fix the bug I mentioned in patch 6. I suggest you move the fix
there.

> +        memcpy(buf, nvme_addr_to_cmb(n, addr), size);
> +        return 0;
>      }
>
> @@ -159,6 +161,48 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
>      }
>  }
>
> +static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
> +    size_t len)
> +{
> +    if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len)) {
> +        return NVME_DATA_TRANSFER_ERROR;
> +    }
> +
> +    qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
> +
> +    return NVME_SUCCESS;
> +}
> +
> +static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
> +    hwaddr addr, size_t len)
> +{
> +    bool addr_is_cmb = nvme_addr_is_cmb(n, addr);
> +
> +    if (addr_is_cmb) {
> +        if (qsg->sg) {
> +            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
> +        }
> +
> +        if (!iov->iov) {
> +            qemu_iovec_init(iov, 1);
> +        }
> +
> +        return nvme_map_addr_cmb(n, iov, addr, len);
> +    }
> +
> +    if (iov->iov) {
> +        return NVME_INVALID_USE_OF_CMB | NVME_DNR;
> +    }
> +
> +    if (!qsg->sg) {
> +        pci_dma_sglist_init(qsg, &n->parent_obj, 1);
> +    }
> +
> +    qemu_sglist_add(qsg, addr, len);
> +
> +    return NVME_SUCCESS;
> +}

Very good refactoring. I would also suggest moving this to a separate
patch. I always put refactoring first, followed by the patches that add
features.

[...]
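The payoff is that every caller which previously open-coded the CMB-versus-host-memory decision collapses into one call per address range. A hypothetical call site in the PRP walker, for illustration:

    /* Hypothetical: map one PRP entry through the shared helper */
    status = nvme_map_addr(n, qsg, iov, prp_ent, trans_len);
    if (status) {
        goto unmap;
    }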
[PATCH v5 21/26] nvme: add support for scatter gather lists
For now, support the Data Block, Segment and Last Segment descriptor
types.

See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").

Signed-off-by: Klaus Jensen
Acked-by: Fam Zheng
---
 block/nvme.c          |  18 +-
 hw/block/nvme.c       | 375 +++---
 hw/block/trace-events |   4 +
 include/block/nvme.h  |  62 ++-
 4 files changed, 389 insertions(+), 70 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index d41c4bda6e39..521f521054d5 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -446,7 +446,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
         error_setg(errp, "Cannot map buffer for DMA");
         goto out;
     }
-    cmd.prp1 = cpu_to_le64(iova);
+    cmd.dptr.prp.prp1 = cpu_to_le64(iova);
 
     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
         error_setg(errp, "Failed to identify controller");
@@ -545,7 +545,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     }
     cmd = (NvmeCmd) {
         .opcode = NVME_ADM_CMD_CREATE_CQ,
-        .prp1 = cpu_to_le64(q->cq.iova),
+        .dptr.prp.prp1 = cpu_to_le64(q->cq.iova),
         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
         .cdw11 = cpu_to_le32(0x3),
     };
@@ -556,7 +556,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
     }
     cmd = (NvmeCmd) {
         .opcode = NVME_ADM_CMD_CREATE_SQ,
-        .prp1 = cpu_to_le64(q->sq.iova),
+        .dptr.prp.prp1 = cpu_to_le64(q->sq.iova),
         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
         .cdw11 = cpu_to_le32(0x1 | (n << 16)),
     };
@@ -906,16 +906,16 @@ try_map:
     case 0:
         abort();
     case 1:
-        cmd->prp1 = pagelist[0];
-        cmd->prp2 = 0;
+        cmd->dptr.prp.prp1 = pagelist[0];
+        cmd->dptr.prp.prp2 = 0;
         break;
     case 2:
-        cmd->prp1 = pagelist[0];
-        cmd->prp2 = pagelist[1];
+        cmd->dptr.prp.prp1 = pagelist[0];
+        cmd->dptr.prp.prp2 = pagelist[1];
         break;
     default:
-        cmd->prp1 = pagelist[0];
-        cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
+        cmd->dptr.prp.prp1 = pagelist[0];
+        cmd->dptr.prp.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
         break;
     }
     trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 204ae1d33234..a91c60fdc111 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -75,8 +75,10 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 
 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-    if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
-        memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
+    hwaddr hi = addr + size;
+
+    if (n->cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
+        memcpy(buf, nvme_addr_to_cmb(n, addr), size);
         return 0;
     }
 
@@ -159,6 +161,48 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
     }
 }
 
+static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
+    size_t len)
+{
+    if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len)) {
+        return NVME_DATA_TRANSFER_ERROR;
+    }
+
+    qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
+
+    return NVME_SUCCESS;
+}
+
+static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+    hwaddr addr, size_t len)
+{
+    bool addr_is_cmb = nvme_addr_is_cmb(n, addr);
+
+    if (addr_is_cmb) {
+        if (qsg->sg) {
+            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+        }
+
+        if (!iov->iov) {
+            qemu_iovec_init(iov, 1);
+        }
+
+        return nvme_map_addr_cmb(n, iov, addr, len);
+    }
+
+    if (iov->iov) {
+        return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+    }
+
+    if (!qsg->sg) {
+        pci_dma_sglist_init(qsg, &n->parent_obj, 1);
+    }
+
+    qemu_sglist_add(qsg, addr, len);
+
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
     uint64_t prp1, uint64_t prp2, uint32_t len, NvmeRequest *req)
 {
@@ -307,15 +351,240 @@ unmap:
     return status;
 }
 
-static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
-    uint64_t prp1, uint64_t prp2, DMADirection dir, NvmeRequest *req)
+static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
+    QEMUIOVector *iov, NvmeSglDescriptor *segment, uint64_t nsgld,
+    uint32_t *len, NvmeRequest *req)
+{
+    dma_addr_t addr, trans_len;
+    uint32_t length;
+    uint16_t status;
+
+    for (int i = 0; i < nsgld; i++) {
+        uint8_t type = NVME_SGL_TYPE(segment[i].type);
+
+        if (type != NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+            switch (type) {
+            case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
+            case NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK:
+                return
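The diff breaks off above, and the include/block/nvme.h hunk is not shown. For orientation, the 16-byte descriptor that nvme_map_sgl_data decodes follows NVM Express 1.3d, Section 4.4; a reconstruction of the likely definitions (type values from the spec, names from the hw/block/nvme.c hunk above, field layout assumed):

    /* 16-byte SGL descriptor, per NVM Express 1.3d, Section 4.4 */
    typedef struct NvmeSglDescriptor {
        uint64_t addr;    /* address of a data block, segment or bucket */
        uint32_t len;     /* length in bytes */
        uint8_t  rsvd[3];
        uint8_t  type;    /* descriptor type in bits 7:4, subtype in 3:0 */
    } NvmeSglDescriptor;

    #define NVME_SGL_TYPE(type) (((type) >> 4) & 0xf)

    enum NvmeSglDescriptorType {
        NVME_SGL_DESCR_TYPE_DATA_BLOCK       = 0x0,
        NVME_SGL_DESCR_TYPE_BIT_BUCKET       = 0x1,
        NVME_SGL_DESCR_TYPE_SEGMENT          = 0x2,
        NVME_SGL_DESCR_TYPE_LAST_SEGMENT     = 0x3,
        NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK = 0x4,
    };

A Segment descriptor points at the next array of descriptors, a Last Segment descriptor at the final such array, and Data Block descriptors carry the actual buffer ranges; Bit Bucket and Keyed Data Block descriptors are the ones the truncated switch above appears to reject, consistent with the commit message supporting only the Data Block, Segment and Last Segment types.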