Re: [PATCH v3 12/17] hw/block/nvme: add support for scatter gather lists
On Oct 22 22:51, Philippe Mathieu-Daudé wrote:
> On 9/22/20 10:45 AM, Klaus Jensen wrote:
> > +static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
>
> Using a 'void *ptr' would simplify your API, avoiding the need for
> callers to cast.

You are correct, but there is a bunch of work being done on top of
nvme-next right now - so even though it is not in master yet, I'd rather
not make too many fixups that trickle down like this. But I will queue
something up for this.
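As a sketch of what the suggested change would look like: the trailing
parameters of nvme_dma() are not shown in the quote above, so the
prototype below is an assumption, and the caller is hypothetical.

/* current: callers holding anything other than a uint8_t * must cast */
static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                         DMADirection dir, NvmeRequest *req);

/* suggested: void * converts implicitly from any object pointer in C */
static uint16_t nvme_dma(NvmeCtrl *n, void *ptr, uint32_t len,
                         DMADirection dir, NvmeRequest *req);

/* hypothetical caller transferring an identify data structure */
NvmeIdCtrl id = {0};
nvme_dma(n, (uint8_t *)&id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req); /* cast needed */
nvme_dma(n, &id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req);            /* cast gone */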
[PATCH v3 12/17] hw/block/nvme: add support for scatter gather lists
From: Klaus Jensen

For now, support the Data Block, Segment and Last Segment descriptor
types.

See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").

Signed-off-by: Klaus Jensen
Reviewed-by: Keith Busch
---
 include/block/nvme.h  |   6 +-
 hw/block/nvme.c       | 329 ++
 hw/block/trace-events |   4 +
 3 files changed, 279 insertions(+), 60 deletions(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 65e68a82c897..58647bcdad0b 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -412,9 +412,9 @@ typedef union NvmeCmdDptr {
 } NvmeCmdDptr;
 
 enum NvmePsdt {
-    PSDT_PRP                 = 0x0,
-    PSDT_SGL_MPTR_CONTIGUOUS = 0x1,
-    PSDT_SGL_MPTR_SGL        = 0x2,
+    NVME_PSDT_PRP                 = 0x0,
+    NVME_PSDT_SGL_MPTR_CONTIGUOUS = 0x1,
+    NVME_PSDT_SGL_MPTR_SGL        = 0x2,
 };
 
 typedef struct QEMU_PACKED NvmeCmd {
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 3b901efd1ec0..c5d09ff1edf5 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -413,13 +413,262 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
     return NVME_SUCCESS;
 }
 
-static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
-                             uint64_t prp1, uint64_t prp2, DMADirection dir,
+/*
+ * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
+ * number of bytes mapped in len.
+ */
+static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
+                                  QEMUIOVector *iov,
+                                  NvmeSglDescriptor *segment, uint64_t nsgld,
+                                  size_t *len, NvmeRequest *req)
+{
+    dma_addr_t addr, trans_len;
+    uint32_t dlen;
+    uint16_t status;
+
+    for (int i = 0; i < nsgld; i++) {
+        uint8_t type = NVME_SGL_TYPE(segment[i].type);
+
+        switch (type) {
+        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
+            break;
+        case NVME_SGL_DESCR_TYPE_SEGMENT:
+        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
+            return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
+        default:
+            return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
+        }
+
+        dlen = le32_to_cpu(segment[i].len);
+        if (!dlen) {
+            continue;
+        }
+
+        if (*len == 0) {
+            /*
+             * All data has been mapped, but the SGL contains additional
+             * segments and/or descriptors. The controller might accept
+             * ignoring the rest of the SGL.
+             */
+            uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
+            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
+                break;
+            }
+
+            trace_pci_nvme_err_invalid_sgl_excess_length(nvme_cid(req));
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        trans_len = MIN(*len, dlen);
+        addr = le64_to_cpu(segment[i].addr);
+
+        if (UINT64_MAX - addr < dlen) {
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        status = nvme_map_addr(n, qsg, iov, addr, trans_len);
+        if (status) {
+            return status;
+        }
+
+        *len -= trans_len;
+    }
+
+    return NVME_SUCCESS;
+}
+
+static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+                             NvmeSglDescriptor sgl, size_t len, NvmeRequest *req)
+{
+    /*
+     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
+     * dynamically allocating a potentially huge SGL. The spec allows the SGL
+     * to be larger (as in number of bytes required to describe the SGL
+     * descriptors and segment chain) than the command transfer size, so it is
+     * not bounded by MDTS.
+     */
+    const int SEG_CHUNK_SIZE = 256;
+
+    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
+    uint64_t nsgld;
+    uint32_t seg_len;
+    uint16_t status;
+    bool sgl_in_cmb = false;
+    hwaddr addr;
+    int ret;
+
+    sgld = &sgl;
+    addr = le64_to_cpu(sgl.addr);
+
+    trace_pci_nvme_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), len);
+
+    /*
+     * If the entire transfer can be described with a single data block it can
+     * be mapped directly.
+     */
+    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+        status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req);
+        if (status) {
+            goto unmap;
+        }
+
+        goto out;
+    }
+
+    /*
+     * If the segment is located in the CMB, the submission queue of the
+     * request must also reside there.
+     */
+    if (nvme_addr_is_cmb(n, addr)) {
+        if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
+            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+        }
+
+        sgl_in_cmb = true;
+    }
+
+    for (;;) {
+        switch (NVME_SGL_TYPE(sgld->type)) {
+        case NVME_SGL_DESCR_TYPE_SEGMENT:
+        case
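To see the mapping logic in isolation, here is a self-contained sketch
of the data-block walk implemented by nvme_map_sgl_data() above. The
types and names are simplified stand-ins invented for illustration; the
CMB checks, QEMUSGList/QEMUIOVector handling, tracing, and NVMe status
codes are all omitted.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* simplified 16-byte stand-in for NvmeSglDescriptor (invented) */
typedef struct {
    uint64_t addr;
    uint32_t len;
    uint8_t  rsvd[3];
    uint8_t  type;                  /* descriptor type in the upper nibble */
} SglDescriptor;

#define SGL_TYPE(t)         ((t) >> 4)
#define SGL_TYPE_DATA_BLOCK 0x0

/*
 * Walk up to nsgld data-block descriptors, decrementing *len by the
 * bytes consumed -- the same contract nvme_map_sgl_data() documents.
 * Returns 0 on success, -1 on an unexpected descriptor type.
 */
static int map_sgl_data(const SglDescriptor *segment, uint64_t nsgld,
                        size_t *len)
{
    for (uint64_t i = 0; i < nsgld; i++) {
        if (SGL_TYPE(segment[i].type) != SGL_TYPE_DATA_BLOCK) {
            return -1;
        }

        uint32_t dlen = segment[i].len;
        if (!dlen) {
            continue;               /* zero-length descriptors are skipped */
        }
        if (*len == 0) {
            break;                  /* excess descriptors; may be ignored */
        }

        /* clamp against the remaining transfer length, as the patch does */
        size_t trans_len = *len < dlen ? *len : dlen;
        printf("map 0x%08" PRIx64 ", %zu bytes\n",
               segment[i].addr, trans_len);
        *len -= trans_len;
    }

    return 0;
}

int main(void)
{
    SglDescriptor segment[2] = {
        { .addr = 0x1000, .len = 4096, .type = SGL_TYPE_DATA_BLOCK << 4 },
        { .addr = 0x3000, .len = 4096, .type = SGL_TYPE_DATA_BLOCK << 4 },
    };
    size_t len = 6144;              /* transfer is shorter than the SGL */

    if (map_sgl_data(segment, 2, &len) == 0 && len == 0) {
        printf("fully mapped\n");
    }

    return 0;
}

A note on the chunking in nvme_map_sgl(): with 16-byte descriptors, the
SEG_CHUNK_SIZE of 256 keeps the on-stack segment buffer at exactly one
4k page (256 * 16 bytes), which is what lets the guest's segment chain
be arbitrarily long without any unbounded allocation in the controller.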