On 7/4/20 11:30 PM, Philippe Mathieu-Daudé wrote:
> To be able to use multiple queues on the same hardware,
> we need to have each queuepair able to receive IRQ
> notifications in the correct AIO context.
> 
> The AIO context and the notification handler have to be proper
> to each queue, not to the block driver. Move aio_context and
> irq_notifier from BDRVNVMeState to NVMeQueuePair.
> 
> Before this patch, only the admin queuepair had an EventNotifier
> and was checking all queues when notified by IRQ.
> After this patch, each queuepair (admin or io) is hooked with its
> own IRQ notifier up to VFIO.

Hmm I should also add a note that we currently only use a single IO
queuepair: nvme_add_io_queue() is called once in nvme_init().

Now after this patch, we should be able to call it twice...

> 
> AioContexts must be identical across all queuepairs and
> BlockDriverStates. Although they all have their own AioContext
> pointer there is no true support for different AioContexts yet.
> (For example, nvme_cmd_sync() is called with a bs argument but
> AIO_WAIT_WHILE(q->aio_context, ...) uses the queuepair
> aio_context so the assumption is that they match.)
> 
> Signed-off-by: Philippe Mathieu-Daudé <phi...@redhat.com>
> ---
> v3:
> - Add notifier to IO queuepairs
> - Reword with Stefan help
> 
> I'd like to split this into smaller changes, but I'm not sure
> if it is possible...
> Maybe move EventNotifier first (keeping aio_context shared),
> then move AioContext per queuepair?
> ---
>  block/nvme.c | 102 +++++++++++++++++++++++++--------------------------
>  1 file changed, 51 insertions(+), 51 deletions(-)
> 
> diff --git a/block/nvme.c b/block/nvme.c
> index 42c0d5284f..fcf8d93fb2 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
> @@ -60,6 +60,8 @@ typedef struct {
>  
>  typedef struct {
>      QemuMutex   lock;
> +    AioContext *aio_context;
> +    EventNotifier irq_notifier;
>  
>      /* Read from I/O code path, initialized under BQL */
>      BDRVNVMeState   *s;
> @@ -107,7 +109,6 @@ QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 
> 0x1000);
>  #define QUEUE_INDEX_IO(n)   (1 + n)
>  
>  struct BDRVNVMeState {
> -    AioContext *aio_context;
>      QEMUVFIOState *vfio;
>      NVMeRegs *regs;
>      /* The submission/completion queue pairs.
> @@ -120,7 +121,6 @@ struct BDRVNVMeState {
>      /* How many uint32_t elements does each doorbell entry take. */
>      size_t doorbell_scale;
>      bool write_cache_supported;
> -    EventNotifier irq_notifier;
>  
>      uint64_t nsze; /* Namespace size reported by identify command */
>      int nsid;      /* The namespace id to read/write data. */
> @@ -227,11 +227,17 @@ static NVMeQueuePair 
> *nvme_create_queue_pair(BDRVNVMeState *s,
>      if (!q->prp_list_pages) {
>          goto fail;
>      }
> +    r = event_notifier_init(&q->irq_notifier, 0);
> +    if (r) {
> +        error_setg(errp, "Failed to init event notifier");
> +        goto fail;
> +    }
>      memset(q->prp_list_pages, 0, s->page_size * NVME_QUEUE_SIZE);
>      qemu_mutex_init(&q->lock);
>      q->s = s;
>      q->index = idx;
>      qemu_co_queue_init(&q->free_req_queue);
> +    q->aio_context = aio_context;
>      q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, 
> q);
>      r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
>                            s->page_size * NVME_NUM_REQS,
> @@ -325,7 +331,7 @@ static void nvme_put_free_req_locked(NVMeQueuePair *q, 
> NVMeRequest *req)
>  static void nvme_wake_free_req_locked(NVMeQueuePair *q)
>  {
>      if (!qemu_co_queue_empty(&q->free_req_queue)) {
> -        replay_bh_schedule_oneshot_event(q->s->aio_context,
> +        replay_bh_schedule_oneshot_event(q->aio_context,
>                  nvme_free_req_queue_cb, q);
>      }
>  }
> @@ -492,7 +498,6 @@ static void nvme_cmd_sync_cb(void *opaque, int ret)
>  static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
>                           NvmeCmd *cmd)
>  {
> -    AioContext *aio_context = bdrv_get_aio_context(bs);
>      NVMeRequest *req;
>      int ret = -EINPROGRESS;
>      req = nvme_get_free_req(q);
> @@ -501,7 +506,7 @@ static int nvme_cmd_sync(BlockDriverState *bs, 
> NVMeQueuePair *q,
>      }
>      nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
>  
> -    AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
> +    AIO_WAIT_WHILE(q->aio_context, ret == -EINPROGRESS);
>      return ret;
>  }
>  
> @@ -616,47 +621,35 @@ static bool nvme_poll_queue(NVMeQueuePair *q)
>      return progress;
>  }
>  
> -static bool nvme_poll_queues(BDRVNVMeState *s)
> -{
> -    bool progress = false;
> -    int i;
> -
> -    for (i = 0; i < s->nr_queues; i++) {
> -        if (nvme_poll_queue(s->queues[i])) {
> -            progress = true;
> -        }
> -    }
> -    return progress;
> -}
> -
>  static void nvme_handle_event(EventNotifier *n)
>  {
> -    BDRVNVMeState *s = container_of(n, BDRVNVMeState, irq_notifier);
> +    NVMeQueuePair *q = container_of(n, NVMeQueuePair, irq_notifier);
>  
> -    trace_nvme_handle_event(s);
> +    trace_nvme_handle_event(q);
>      event_notifier_test_and_clear(n);
> -    nvme_poll_queues(s);
> +    nvme_poll_queue(q);
>  }
>  
>  static bool nvme_poll_cb(void *opaque)
>  {
>      EventNotifier *e = opaque;
> -    BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier);
> +    NVMeQueuePair *q = container_of(e, NVMeQueuePair, irq_notifier);
>  
> -    trace_nvme_poll_cb(s);
> -    return nvme_poll_queues(s);
> +    trace_nvme_poll_cb(q);
> +    return nvme_poll_queue(q);
>  }
>  
> -static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
> +static bool nvme_add_io_queue(BlockDriverState *bs,
> +                              AioContext *aio_context, Error **errp)
>  {
>      BDRVNVMeState *s = bs->opaque;
>      int n = s->nr_queues;
>      NVMeQueuePair *q;
>      NvmeCmd cmd;
>      int queue_size = NVME_QUEUE_SIZE;
> +    int ret;
>  
> -    q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
> -                               n, queue_size, errp);
> +    q = nvme_create_queue_pair(s, aio_context, n, queue_size, errp);
>      if (!q) {
>          return false;
>      }
> @@ -683,6 +676,17 @@ static bool nvme_add_io_queue(BlockDriverState *bs, 
> Error **errp)
>      s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
>      s->queues[n] = q;
>      s->nr_queues++;
> +
> +    ret = qemu_vfio_pci_init_irq(s->vfio,
> +                                 &s->queues[n]->irq_notifier,
> +                                 VFIO_PCI_MSIX_IRQ_INDEX, errp);
> +    if (ret) {
> +        goto out_error;
> +    }
> +    aio_set_event_notifier(aio_context,
> +                           &s->queues[n]->irq_notifier,
> +                           false, nvme_handle_event, nvme_poll_cb);
> +
>      return true;
>  out_error:
>      nvme_free_queue_pair(q);
> @@ -704,12 +708,6 @@ static int nvme_init(BlockDriverState *bs, const char 
> *device, int namespace,
>      qemu_co_queue_init(&s->dma_flush_queue);
>      s->device = g_strdup(device);
>      s->nsid = namespace;
> -    s->aio_context = bdrv_get_aio_context(bs);
> -    ret = event_notifier_init(&s->irq_notifier, 0);
> -    if (ret) {
> -        error_setg(errp, "Failed to init event notifier");
> -        return ret;
> -    }
>  
>      s->vfio = qemu_vfio_open_pci(device, errp);
>      if (!s->vfio) {
> @@ -784,12 +782,14 @@ static int nvme_init(BlockDriverState *bs, const char 
> *device, int namespace,
>          }
>      }
>  
> -    ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier,
> +    ret = qemu_vfio_pci_init_irq(s->vfio,
> +                                 &s->queues[QUEUE_INDEX_ADMIN]->irq_notifier,
>                                   VFIO_PCI_MSIX_IRQ_INDEX, errp);
>      if (ret) {
>          goto out;
>      }
> -    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
> +    aio_set_event_notifier(aio_context,
> +                           &s->queues[QUEUE_INDEX_ADMIN]->irq_notifier,
>                             false, nvme_handle_event, nvme_poll_cb);
>  
>      nvme_identify(bs, namespace, &local_err);
> @@ -800,7 +800,7 @@ static int nvme_init(BlockDriverState *bs, const char 
> *device, int namespace,
>      }
>  
>      /* Set up command queues. */
> -    if (!nvme_add_io_queue(bs, errp)) {
> +    if (!nvme_add_io_queue(bs, aio_context, errp)) {
>          ret = -EIO;
>      }
>  out:
> @@ -869,12 +869,14 @@ static void nvme_close(BlockDriverState *bs)
>      BDRVNVMeState *s = bs->opaque;
>  
>      for (i = 0; i < s->nr_queues; ++i) {
> -        nvme_free_queue_pair(s->queues[i]);
> +        NVMeQueuePair *q = s->queues[i];
> +
> +        aio_set_event_notifier(q->aio_context,
> +                               &q->irq_notifier, false, NULL, NULL);
> +        event_notifier_cleanup(&q->irq_notifier);
> +        nvme_free_queue_pair(q);
>      }
>      g_free(s->queues);
> -    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
> -                           false, NULL, NULL);
> -    event_notifier_cleanup(&s->irq_notifier);
>      qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
>      qemu_vfio_close(s->vfio);
>  
> @@ -1086,7 +1088,7 @@ static coroutine_fn int 
> nvme_co_prw_aligned(BlockDriverState *bs,
>          .cdw12 = cpu_to_le32(cdw12),
>      };
>      NVMeCoData data = {
> -        .ctx = bdrv_get_aio_context(bs),
> +        .ctx = ioq->aio_context,
>          .ret = -EINPROGRESS,
>      };
>  
> @@ -1195,7 +1197,7 @@ static coroutine_fn int nvme_co_flush(BlockDriverState 
> *bs)
>          .nsid = cpu_to_le32(s->nsid),
>      };
>      NVMeCoData data = {
> -        .ctx = bdrv_get_aio_context(bs),
> +        .ctx = ioq->aio_context,
>          .ret = -EINPROGRESS,
>      };
>  
> @@ -1236,7 +1238,7 @@ static coroutine_fn int 
> nvme_co_pwrite_zeroes(BlockDriverState *bs,
>      };
>  
>      NVMeCoData data = {
> -        .ctx = bdrv_get_aio_context(bs),
> +        .ctx = ioq->aio_context,
>          .ret = -EINPROGRESS,
>      };
>  
> @@ -1286,7 +1288,7 @@ static int coroutine_fn 
> nvme_co_pdiscard(BlockDriverState *bs,
>      };
>  
>      NVMeCoData data = {
> -        .ctx = bdrv_get_aio_context(bs),
> +        .ctx = ioq->aio_context,
>          .ret = -EINPROGRESS,
>      };
>  
> @@ -1379,10 +1381,10 @@ static void nvme_detach_aio_context(BlockDriverState 
> *bs)
>  
>          qemu_bh_delete(q->completion_bh);
>          q->completion_bh = NULL;
> -    }
>  
> -    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
> -                           false, NULL, NULL);
> +        aio_set_event_notifier(bdrv_get_aio_context(bs), &q->irq_notifier,
> +                               false, NULL, NULL);
> +    }
>  }
>  
>  static void nvme_attach_aio_context(BlockDriverState *bs,
> @@ -1390,13 +1392,11 @@ static void nvme_attach_aio_context(BlockDriverState 
> *bs,
>  {
>      BDRVNVMeState *s = bs->opaque;
>  
> -    s->aio_context = new_context;
> -    aio_set_event_notifier(new_context, &s->irq_notifier,
> -                           false, nvme_handle_event, nvme_poll_cb);
> -
>      for (int i = 0; i < s->nr_queues; i++) {
>          NVMeQueuePair *q = s->queues[i];
>  
> +        aio_set_event_notifier(new_context, &q->irq_notifier,
> +                               false, nvme_handle_event, nvme_poll_cb);
>          q->completion_bh =
>              aio_bh_new(new_context, nvme_process_completion_bh, q);
>      }
> 


Reply via email to