On Tue, 5 Jul 2022 at 10:25, Jinhao Fan <fanjinhao...@ict.ac.cn> wrote:
>
> Add property "ioeventfd" which is enabled by default. When this is
> enabled, updates on the doorbell registers will cause KVM to signal
> an event to the QEMU main loop to handle the doorbell updates.
> Therefore, instead of letting the vcpu thread run both guest VM and
> IO emulation, we now use the main loop thread to do IO emulation and
> thus the vcpu thread has more cycles for the guest VM.
>
> Since ioeventfd does not tell us the exact value that is written, it is
> only useful when the shadow doorbell buffer is enabled, where we check
> for the value in the shadow doorbell buffer when we get the doorbell
> update event.
>
> IOPS comparison on Linux 5.19-rc2 (unit: KIOPS):
>
> qd           1    4   16   64
> qemu        35  121  176  153
> ioeventfd   41  133  258  313

Nice

> Changes since v3:
>   - Do not deregister ioeventfd when it was not enabled on an SQ/CQ
>
> Signed-off-by: Jinhao Fan <fanjinhao...@ict.ac.cn>
> ---
>  hw/nvme/ctrl.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/nvme/nvme.h |   5 +++
>  2 files changed, 118 insertions(+), 1 deletion(-)
>
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index c952c34f94..4b75c5f549 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
>
>      QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
>      QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
> -    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> +
> +    if (req->sq->ioeventfd_enabled) {
> +        /* Post CQE directly since we are in main loop thread */
> +        nvme_post_cqes(cq);
> +    } else {
> +        /* Schedule the timer to post CQE later since we are in vcpu thread */
> +        timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> +    }

This change is somewhat independent of ioeventfd. I wonder how much of
the performance improvement is due to the sq ioeventfd and how much is
due to bypassing the completion timer. In the qd=1 case I expect the
completion timer to increase latency and hurt performance; in the qd=64
case the timer increases batching, reduces CPU consumption, and
probably improves performance.

It would be nice to measure this in isolation, independently of
ioeventfd/IOThreads. (A rough sketch of what I mean is at the bottom of
this mail.)

>  }
>
>  static void nvme_process_aers(void *opaque)
> @@ -4195,10 +4202,82 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
>      return NVME_INVALID_OPCODE | NVME_DNR;
>  }
>
> +static void nvme_cq_notifier(EventNotifier *e)
> +{
> +    NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
> +    NvmeCtrl *n = cq->ctrl;
> +
> +    event_notifier_test_and_clear(&cq->notifier);
> +
> +    nvme_update_cq_head(cq);
> +
> +    if (cq->tail == cq->head) {
> +        if (cq->irq_enabled) {
> +            n->cq_pending--;
> +        }
> +
> +        nvme_irq_deassert(n, cq);
> +    }
> +
> +    nvme_post_cqes(cq);
> +}
> +
> +static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
> +{
> +    NvmeCtrl *n = cq->ctrl;
> +    uint16_t offset = (cq->cqid << 3) + (1 << 2);

Too many...

> +    int ret;
> +
> +    ret = event_notifier_init(&cq->notifier, 0);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> +    memory_region_add_eventfd(&n->iomem,
> +                              0x1000 + offset, 4, false, 0, &cq->notifier);

...magic numbers. Please use constants so it's clear what << 3, (1 << 2),
0x1000, etc. mean.
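
Something along these lines would make the intent obvious (just a
sketch, the constant and helper names are made up; the values follow
from the NVMe register layout: the doorbell array starts at BAR0 offset
0x1000, SQ tail and CQ head doorbells are interleaved per queue, and
doorbells are 4 << CAP.DSTRD bytes apart, with DSTRD reported as 0
here, I believe):

/* The first doorbell register (SQ0TDBL) lives at BAR0 offset 0x1000 */
#define NVME_DOORBELL_BASE    0x1000
/* Doorbells are 4 << CAP.DSTRD bytes apart; this device reports DSTRD == 0 */
#define NVME_DOORBELL_STRIDE  4

/* SQyTDBL: the tail doorbell of submission queue y */
static inline hwaddr nvme_sq_doorbell_offset(uint16_t sqid)
{
    return NVME_DOORBELL_BASE + 2 * sqid * NVME_DOORBELL_STRIDE;
}

/* CQyHDBL: the head doorbell of completion queue y, right after SQyTDBL */
static inline hwaddr nvme_cq_doorbell_offset(uint16_t cqid)
{
    return NVME_DOORBELL_BASE + (2 * cqid + 1) * NVME_DOORBELL_STRIDE;
}

Then the registration becomes self-documenting:

memory_region_add_eventfd(&n->iomem, nvme_sq_doorbell_offset(sq->sqid),
                          NVME_DOORBELL_STRIDE, false, 0, &sq->notifier);

and the same stride arithmetic could replace the open-coded shadow
doorbell offsets in nvme_init_sq()/nvme_init_cq() and
nvme_dbbuf_config().
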
> +
> +    return 0;
> +}
> +
> +static void nvme_sq_notifier(EventNotifier *e)
> +{
> +    NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
> +
> +    event_notifier_test_and_clear(&sq->notifier);
> +
> +    nvme_process_sq(sq);
> +}
> +
> +static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
> +{
> +    NvmeCtrl *n = sq->ctrl;
> +    uint16_t offset = sq->sqid << 3;
> +    int ret;
> +
> +    ret = event_notifier_init(&sq->notifier, 0);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> +    memory_region_add_eventfd(&n->iomem,
> +                              0x1000 + offset, 4, false, 0, &sq->notifier);
> +
> +    return 0;
> +}
> +
>  static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
>  {
> +    uint16_t offset = sq->sqid << 3;
> +
>      n->sq[sq->sqid] = NULL;
>      timer_free(sq->timer);
> +    if (sq->ioeventfd_enabled) {
> +        memory_region_del_eventfd(&n->iomem,
> +                                  0x1000 + offset, 4, false, 0, &sq->notifier);
> +        event_notifier_cleanup(&sq->notifier);
> +    }
>      g_free(sq->io_req);
>      if (sq->sqid) {
>          g_free(sq);
> @@ -4271,6 +4350,12 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>      if (n->dbbuf_enabled) {
>          sq->db_addr = n->dbbuf_dbs + (sqid << 3);
>          sq->ei_addr = n->dbbuf_eis + (sqid << 3);
> +
> +        if (n->params.ioeventfd && sq->sqid != 0) {
> +            if (!nvme_init_sq_ioeventfd(sq)) {
> +                sq->ioeventfd_enabled = true;
> +            }
> +        }
>      }
>
>      assert(n->cq[cqid]);
> @@ -4575,8 +4660,15 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
>
>  static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
>  {
> +    uint16_t offset = (cq->cqid << 3) + (1 << 2);
> +
>      n->cq[cq->cqid] = NULL;
>      timer_free(cq->timer);
> +    if (cq->ioeventfd_enabled) {
> +        memory_region_del_eventfd(&n->iomem,
> +                                  0x1000 + offset, 4, false, 0, &cq->notifier);
> +        event_notifier_cleanup(&cq->notifier);
> +    }
>      if (msix_enabled(&n->parent_obj)) {
>          msix_vector_unuse(&n->parent_obj, cq->vector);
>      }
> @@ -4635,6 +4727,12 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>      if (n->dbbuf_enabled) {
>          cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
>          cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
> +
> +        if (n->params.ioeventfd && cqid != 0) {
> +            if (!nvme_init_cq_ioeventfd(cq)) {
> +                cq->ioeventfd_enabled = true;
> +            }
> +        }
>      }
>      n->cq[cqid] = cq;
>      cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
> @@ -5793,6 +5891,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>      uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
>      uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
>      int i;
> +    int ret;
>
>      /* Address should be page aligned */
>      if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
> @@ -5818,6 +5917,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>              sq->ei_addr = eis_addr + (i << 3);
>              pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
>                            sizeof(sq->tail));
> +
> +            if (n->params.ioeventfd && sq->sqid != 0) {
> +                if (!nvme_init_sq_ioeventfd(sq)) {
> +                    sq->ioeventfd_enabled = true;
> +                }
> +            }
>          }
>
>          if (cq) {
> @@ -5826,6 +5931,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>              cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
>              cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
>              pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
>                            sizeof(cq->head));
> +
> +            if (n->params.ioeventfd && cq->cqid != 0) {
> +                if (!nvme_init_cq_ioeventfd(cq)) {
> +                    cq->ioeventfd_enabled = true;
> +                }
> +            }
>          }
>      }
>
> @@ -7040,6 +7151,7 @@ static Property nvme_props[] = {
>      DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
>      DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
>                       params.auto_transition_zones, true),
> +    DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>
> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 4452e4b1bf..2a9beea0c8 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -369,6 +369,8 @@ typedef struct NvmeSQueue {
>      uint64_t db_addr;
>      uint64_t ei_addr;
>      QEMUTimer *timer;
> +    EventNotifier notifier;
> +    bool ioeventfd_enabled;
>      NvmeRequest *io_req;
>      QTAILQ_HEAD(, NvmeRequest) req_list;
>      QTAILQ_HEAD(, NvmeRequest) out_req_list;
> @@ -388,6 +390,8 @@ typedef struct NvmeCQueue {
>      uint64_t db_addr;
>      uint64_t ei_addr;
>      QEMUTimer *timer;
> +    EventNotifier notifier;
> +    bool ioeventfd_enabled;
>      QTAILQ_HEAD(, NvmeSQueue) sq_list;
>      QTAILQ_HEAD(, NvmeRequest) req_list;
>  } NvmeCQueue;
> @@ -410,6 +414,7 @@ typedef struct NvmeParams {
>      uint8_t zasl;
>      bool auto_transition_zones;
>      bool legacy_cmb;
> +    bool ioeventfd;
>  } NvmeParams;
>
>  typedef struct NvmeCtrl {
> --
> 2.25.1
>
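
Coming back to the completion timer question above: an untested way to
measure that effect in isolation would be to rerun the same qd=1/qd=64
benchmarks on top of current master (without this series) with the
timer unconditionally bypassed, e.g.:

--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
-    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+    /* measurement hack only: post the CQE immediately, no batching */
+    nvme_post_cqes(cq);

That is a measurement hack rather than a mergeable change (I think
calling nvme_post_cqes() here is safe under the BQL, but I haven't
tested it). Comparing those numbers against plain master, and against
this series with -device nvme,...,ioeventfd=on vs ioeventfd=off, should
separate the timer effect from the ioeventfd effect.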