Hi Klaus,
On Thu, 19 Dec 2019 at 13:09, Klaus Jensen wrote:
>
> This refactors how the device issues asynchronous block backend
> requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> associated with the command. This allows multiple aios to be issued for
> a command. Only when all requests have been completed will the device
> post a completion queue entry.
>
> Because the device is currently guaranteed to only issue a single aio
> request per command, the benefit is not immediately obvious. But this
> functionality is required to support metadata, the dataset management
> command and other features.
>
> Signed-off-by: Klaus Jensen
> Signed-off-by: Klaus Jensen
> ---
> hw/block/nvme.c | 422 ++
> hw/block/nvme.h | 126 +++--
> hw/block/trace-events | 8 +
> 3 files changed, 461 insertions(+), 95 deletions(-)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index be554ae1e94c..56659bbe263a 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -19,7 +19,8 @@
> * -drive file=<file>,if=none,id=<drive_id>
> * -device nvme,drive=<drive_id>,serial=<serial>,id=<id>, \
> * cmb_size_mb=<cmb_size_mb>, \
> - * num_queues=<N>
> + * num_queues=<N>, \
> + * mdts=<mdts>
> *
> * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
> * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> @@ -55,6 +56,7 @@
> } while (0)
>
> static void nvme_process_sq(void *opaque);
> +static void nvme_aio_cb(void *opaque, int ret);
>
> static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
> {
> @@ -339,6 +341,116 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr,
> uint32_t len,
> return status;
> }
>
> +static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +{
> +NvmeNamespace *ns = req->ns;
> +
> +uint32_t len = req->nlb << nvme_ns_lbads(ns);
> +uint64_t prp1 = le64_to_cpu(cmd->prp1);
> +uint64_t prp2 = le64_to_cpu(cmd->prp2);
> +
> +return nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
> +}
> +
> +static void nvme_aio_destroy(NvmeAIO *aio)
> +{
> +g_free(aio);
> +}
> +
> +static NvmeAIO *nvme_aio_new(BlockBackend *blk, int64_t offset, size_t len,
> +QEMUSGList *qsg, QEMUIOVector *iov, NvmeRequest *req,
> +NvmeAIOCompletionFunc *cb)
Minor: The indentation here (and in a few other places across the patchset)
does not seem right. Also, could this function be marked inline?
Another thought: it seems there are cases where some of the parameters are
not required (NULL); maybe a simplified variant for those cases would be
useful?
> +{
> +NvmeAIO *aio = g_malloc0(sizeof(*aio));
> +
> +*aio = (NvmeAIO) {
> +.blk = blk,
> +.offset = offset,
> +.len = len,
> +.req = req,
> +.qsg = qsg,
> +.iov = iov,
> +.cb = cb,
> +};
> +
> +return aio;
> +}
> +
> +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
> +NvmeAIOOp opc)
> +{
> +aio->opc = opc;
> +
> +trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
> +aio->offset, aio->len, nvme_aio_opc_str(aio), req);
> +
> +if (req) {
> +QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
> +}
> +}
> +
> +static void nvme_aio(NvmeAIO *aio)
> +{
> +BlockBackend *blk = aio->blk;
> +BlockAcctCookie *acct = &aio->acct;
> +BlockAcctStats *stats = blk_get_stats(blk);
> +
> +bool is_write, dma;
> +
> +switch (aio->opc) {
> +case NVME_AIO_OPC_NONE:
> +break;
> +
> +case NVME_AIO_OPC_FLUSH:
> +block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
> +aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
> +break;
> +
> +case NVME_AIO_OPC_WRITE_ZEROES:
> +block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
> +aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
> +BDRV_REQ_MAY_UNMAP, nvme_aio_cb, aio);
> +break;
> +
> +case NVME_AIO_OPC_READ:
> +case NVME_AIO_OPC_WRITE:
> +dma = aio->qsg != NULL;
> +is_write = (aio->opc == NVME_AIO_OPC_WRITE);
> +
> +block_acct_start(stats, acct, aio->len,
> +is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
> +
> +if (dma) {
> +aio->aiocb = is_write ?
> +dma_blk_write(blk, aio->qsg, aio->offset,
> +BDRV_SECTOR_SIZE, nvme_aio_cb, aio) :
> +dma_blk_read(blk, aio->qsg, aio->offset,
> +BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
> +
> +return;
> +}
> +
> +aio->aiocb = is_write ?
> +blk_aio_pwritev(blk, aio->offset, aio->iov, 0,
> +nvme_aio_cb, aio) :
> +blk_aio_preadv(blk, aio->offset, aio->iov, 0,
> +nvme_aio_cb, aio);
> +
> +break;
> +}
> +}
> +
> +static void nvme_rw_aio(BlockBackend *blk, uint64_t offset, NvmeRequest *req)
> +{
> +