On 7/4/20 11:30 PM, Philippe Mathieu-Daudé wrote: > To be able to use multiple queues on the same hardware, > we need to have each queuepair able to receive IRQ > notifications in the correct AIO context. > > The AIO context and the notification handler have to be proper > to each queue, not to the block driver. Move aio_context and > irq_notifier from BDRVNVMeState to NVMeQueuePair. > > Before this patch, only the admin queuepair had an EventNotifier > and was checking all queues when notified by IRQ. > After this patch, each queuepair (admin or io) is hooked with its > own IRQ notifier up to VFIO.
Hmm I should also add a note that we currently only use a single IO queuepair: nvme_add_io_queue() is called once in nvme_init(). Now after this patch, we should be able to call it twice... > > AioContexts must be identical across all queuepairs and > BlockDriverStates. Although they all have their own AioContext > pointer there is no true support for different AioContexts yet. > (For example, nvme_cmd_sync() is called with a bs argument but > AIO_WAIT_WHILE(q->aio_context, ...) uses the queuepair > aio_context so the assumption is that they match.) > > Signed-off-by: Philippe Mathieu-Daudé <phi...@redhat.com> > --- > v3: > - Add notifier to IO queuepairs > - Reword with Stefan help > > I'd like to split this into smaller changes, but I'm not sure > if it is possible... > Maybe move EventNotifier first (keeping aio_context shared), > then move AioContext per queuepair? > --- > block/nvme.c | 102 +++++++++++++++++++++++++-------------------------- > 1 file changed, 51 insertions(+), 51 deletions(-) > > diff --git a/block/nvme.c b/block/nvme.c > index 42c0d5284f..fcf8d93fb2 100644 > --- a/block/nvme.c > +++ b/block/nvme.c > @@ -60,6 +60,8 @@ typedef struct { > > typedef struct { > QemuMutex lock; > + AioContext *aio_context; > + EventNotifier irq_notifier; > > /* Read from I/O code path, initialized under BQL */ > BDRVNVMeState *s; > @@ -107,7 +109,6 @@ QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != > 0x1000); > #define QUEUE_INDEX_IO(n) (1 + n) > > struct BDRVNVMeState { > - AioContext *aio_context; > QEMUVFIOState *vfio; > NVMeRegs *regs; > /* The submission/completion queue pairs. > @@ -120,7 +121,6 @@ struct BDRVNVMeState { > /* How many uint32_t elements does each doorbell entry take. */ > size_t doorbell_scale; > bool write_cache_supported; > - EventNotifier irq_notifier; > > uint64_t nsze; /* Namespace size reported by identify command */ > int nsid; /* The namespace id to read/write data. */ > @@ -227,11 +227,17 @@ static NVMeQueuePair > *nvme_create_queue_pair(BDRVNVMeState *s, > if (!q->prp_list_pages) { > goto fail; > } > + r = event_notifier_init(&q->irq_notifier, 0); > + if (r) { > + error_setg(errp, "Failed to init event notifier"); > + goto fail; > + } > memset(q->prp_list_pages, 0, s->page_size * NVME_QUEUE_SIZE); > qemu_mutex_init(&q->lock); > q->s = s; > q->index = idx; > qemu_co_queue_init(&q->free_req_queue); > + q->aio_context = aio_context; > q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, > q); > r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, > s->page_size * NVME_NUM_REQS, > @@ -325,7 +331,7 @@ static void nvme_put_free_req_locked(NVMeQueuePair *q, > NVMeRequest *req) > static void nvme_wake_free_req_locked(NVMeQueuePair *q) > { > if (!qemu_co_queue_empty(&q->free_req_queue)) { > - replay_bh_schedule_oneshot_event(q->s->aio_context, > + replay_bh_schedule_oneshot_event(q->aio_context, > nvme_free_req_queue_cb, q); > } > } > @@ -492,7 +498,6 @@ static void nvme_cmd_sync_cb(void *opaque, int ret) > static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, > NvmeCmd *cmd) > { > - AioContext *aio_context = bdrv_get_aio_context(bs); > NVMeRequest *req; > int ret = -EINPROGRESS; > req = nvme_get_free_req(q); > @@ -501,7 +506,7 @@ static int nvme_cmd_sync(BlockDriverState *bs, > NVMeQueuePair *q, > } > nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret); > > - AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS); > + AIO_WAIT_WHILE(q->aio_context, ret == -EINPROGRESS); > return ret; > } > > @@ -616,47 +621,35 @@ static bool nvme_poll_queue(NVMeQueuePair *q) > return progress; > } > > -static bool nvme_poll_queues(BDRVNVMeState *s) > -{ > - bool progress = false; > - int i; > - > - for (i = 0; i < s->nr_queues; i++) { > - if (nvme_poll_queue(s->queues[i])) { > - progress = true; > - } > - } > - return progress; > -} > - > static void nvme_handle_event(EventNotifier *n) > { > - BDRVNVMeState *s = container_of(n, BDRVNVMeState, irq_notifier); > + NVMeQueuePair *q = container_of(n, NVMeQueuePair, irq_notifier); > > - trace_nvme_handle_event(s); > + trace_nvme_handle_event(q); > event_notifier_test_and_clear(n); > - nvme_poll_queues(s); > + nvme_poll_queue(q); > } > > static bool nvme_poll_cb(void *opaque) > { > EventNotifier *e = opaque; > - BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier); > + NVMeQueuePair *q = container_of(e, NVMeQueuePair, irq_notifier); > > - trace_nvme_poll_cb(s); > - return nvme_poll_queues(s); > + trace_nvme_poll_cb(q); > + return nvme_poll_queue(q); > } > > -static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) > +static bool nvme_add_io_queue(BlockDriverState *bs, > + AioContext *aio_context, Error **errp) > { > BDRVNVMeState *s = bs->opaque; > int n = s->nr_queues; > NVMeQueuePair *q; > NvmeCmd cmd; > int queue_size = NVME_QUEUE_SIZE; > + int ret; > > - q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs), > - n, queue_size, errp); > + q = nvme_create_queue_pair(s, aio_context, n, queue_size, errp); > if (!q) { > return false; > } > @@ -683,6 +676,17 @@ static bool nvme_add_io_queue(BlockDriverState *bs, > Error **errp) > s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1); > s->queues[n] = q; > s->nr_queues++; > + > + ret = qemu_vfio_pci_init_irq(s->vfio, > + &s->queues[n]->irq_notifier, > + VFIO_PCI_MSIX_IRQ_INDEX, errp); > + if (ret) { > + goto out_error; > + } > + aio_set_event_notifier(aio_context, > + &s->queues[n]->irq_notifier, > + false, nvme_handle_event, nvme_poll_cb); > + > return true; > out_error: > nvme_free_queue_pair(q); > @@ -704,12 +708,6 @@ static int nvme_init(BlockDriverState *bs, const char > *device, int namespace, > qemu_co_queue_init(&s->dma_flush_queue); > s->device = g_strdup(device); > s->nsid = namespace; > - s->aio_context = bdrv_get_aio_context(bs); > - ret = event_notifier_init(&s->irq_notifier, 0); > - if (ret) { > - error_setg(errp, "Failed to init event notifier"); > - return ret; > - } > > s->vfio = qemu_vfio_open_pci(device, errp); > if (!s->vfio) { > @@ -784,12 +782,14 @@ static int nvme_init(BlockDriverState *bs, const char > *device, int namespace, > } > } > > - ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier, > + ret = qemu_vfio_pci_init_irq(s->vfio, > + &s->queues[QUEUE_INDEX_ADMIN]->irq_notifier, > VFIO_PCI_MSIX_IRQ_INDEX, errp); > if (ret) { > goto out; > } > - aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, > + aio_set_event_notifier(aio_context, > + &s->queues[QUEUE_INDEX_ADMIN]->irq_notifier, > false, nvme_handle_event, nvme_poll_cb); > > nvme_identify(bs, namespace, &local_err); > @@ -800,7 +800,7 @@ static int nvme_init(BlockDriverState *bs, const char > *device, int namespace, > } > > /* Set up command queues. */ > - if (!nvme_add_io_queue(bs, errp)) { > + if (!nvme_add_io_queue(bs, aio_context, errp)) { > ret = -EIO; > } > out: > @@ -869,12 +869,14 @@ static void nvme_close(BlockDriverState *bs) > BDRVNVMeState *s = bs->opaque; > > for (i = 0; i < s->nr_queues; ++i) { > - nvme_free_queue_pair(s->queues[i]); > + NVMeQueuePair *q = s->queues[i]; > + > + aio_set_event_notifier(q->aio_context, > + &q->irq_notifier, false, NULL, NULL); > + event_notifier_cleanup(&q->irq_notifier); > + nvme_free_queue_pair(q); > } > g_free(s->queues); > - aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, > - false, NULL, NULL); > - event_notifier_cleanup(&s->irq_notifier); > qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE); > qemu_vfio_close(s->vfio); > > @@ -1086,7 +1088,7 @@ static coroutine_fn int > nvme_co_prw_aligned(BlockDriverState *bs, > .cdw12 = cpu_to_le32(cdw12), > }; > NVMeCoData data = { > - .ctx = bdrv_get_aio_context(bs), > + .ctx = ioq->aio_context, > .ret = -EINPROGRESS, > }; > > @@ -1195,7 +1197,7 @@ static coroutine_fn int nvme_co_flush(BlockDriverState > *bs) > .nsid = cpu_to_le32(s->nsid), > }; > NVMeCoData data = { > - .ctx = bdrv_get_aio_context(bs), > + .ctx = ioq->aio_context, > .ret = -EINPROGRESS, > }; > > @@ -1236,7 +1238,7 @@ static coroutine_fn int > nvme_co_pwrite_zeroes(BlockDriverState *bs, > }; > > NVMeCoData data = { > - .ctx = bdrv_get_aio_context(bs), > + .ctx = ioq->aio_context, > .ret = -EINPROGRESS, > }; > > @@ -1286,7 +1288,7 @@ static int coroutine_fn > nvme_co_pdiscard(BlockDriverState *bs, > }; > > NVMeCoData data = { > - .ctx = bdrv_get_aio_context(bs), > + .ctx = ioq->aio_context, > .ret = -EINPROGRESS, > }; > > @@ -1379,10 +1381,10 @@ static void nvme_detach_aio_context(BlockDriverState > *bs) > > qemu_bh_delete(q->completion_bh); > q->completion_bh = NULL; > - } > > - aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, > - false, NULL, NULL); > + aio_set_event_notifier(bdrv_get_aio_context(bs), &q->irq_notifier, > + false, NULL, NULL); > + } > } > > static void nvme_attach_aio_context(BlockDriverState *bs, > @@ -1390,13 +1392,11 @@ static void nvme_attach_aio_context(BlockDriverState > *bs, > { > BDRVNVMeState *s = bs->opaque; > > - s->aio_context = new_context; > - aio_set_event_notifier(new_context, &s->irq_notifier, > - false, nvme_handle_event, nvme_poll_cb); > - > for (int i = 0; i < s->nr_queues; i++) { > NVMeQueuePair *q = s->queues[i]; > > + aio_set_event_notifier(new_context, &q->irq_notifier, > + false, nvme_handle_event, nvme_poll_cb); > q->completion_bh = > aio_bh_new(new_context, nvme_process_completion_bh, q); > } >