Re: [PATCH 4/7] block/nvme: switch to a NVMeRequest freelist

2020-05-25 Thread Sergio Lopez
On Tue, May 19, 2020 at 06:11:35PM +0100, Stefan Hajnoczi wrote:
> There are three issues with the current NVMeRequest->busy field:
> 1. The busy field is accidentally accessed outside q->lock when request
>    submission fails.
> 2. Waiters on free_req_queue are not woken when a request is returned
>    early due to submission failure.
> 3. Finding a free request involves scanning all requests. This makes
>    request submission O(n^2).
> 
> Switch to an O(1) freelist that is always accessed under the lock.
> 
> Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and
> NVME_NUM_REQS, the number of usable requests. This makes the code
> simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind
> that one slot is reserved.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  block/nvme.c | 81 ++--
>  1 file changed, 54 insertions(+), 27 deletions(-)

Reviewed-by: Sergio Lopez 




[PATCH 4/7] block/nvme: switch to a NVMeRequest freelist

2020-05-19 Thread Stefan Hajnoczi
There are three issues with the current NVMeRequest->busy field:
1. The busy field is accidentally accessed outside q->lock when request
   submission fails.
2. Waiters on free_req_queue are not woken when a request is returned
   early due to submission failure.
3. Finding a free request involves scanning all requests. This makes
   request submission O(n^2).

Switch to an O(1) freelist that is always accessed under the lock.
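
To make the switch concrete, here is a minimal standalone sketch of an
index-linked freelist over a fixed array. The names and array size are
illustrative, not the driver's; in the driver every operation below runs
under q->lock:

    #include <stddef.h>

    #define NUM_REQS 8

    struct req {
        int free_next;            /* index of next free req, -1 = end of list */
        /* ... per-request state ... */
    };

    static struct req reqs[NUM_REQS];
    static int free_head = -1;    /* index of first free req, -1 = list empty */

    static void freelist_init(void)
    {
        for (int i = 0; i < NUM_REQS; i++) {
            reqs[i].free_next = free_head;    /* push each slot onto the list */
            free_head = i;
        }
    }

    static struct req *req_get(void)          /* O(1) pop from the head */
    {
        if (free_head == -1) {
            return NULL;                      /* nothing free; caller must wait */
        }
        struct req *r = &reqs[free_head];
        free_head = r->free_next;
        r->free_next = -1;
        return r;
    }

    static void req_put(struct req *r)        /* O(1) push onto the head */
    {
        r->free_next = free_head;
        free_head = r - reqs;                 /* recover the slot index */
    }

Because the links are array indices rather than pointers, the list costs
one int per request and needs no allocation beyond the existing array.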

Also differentiate between NVME_QUEUE_SIZE, the actual SQ/CQ size, and
NVME_NUM_REQS, the number of usable requests. This makes the code
simpler than using NVME_QUEUE_SIZE everywhere and having to keep in mind
that one slot is reserved.
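
The reserved slot is the classic ring-buffer ambiguity: with head and
tail indices taken modulo the queue size, head == tail means "empty", so
"full" has to be declared one entry early. A minimal sketch (the macro
names mirror the patch; the helper functions are illustrative):

    #include <stdbool.h>

    #define QUEUE_SIZE 128                  /* SQ/CQ entries, cf. NVME_QUEUE_SIZE */
    #define NUM_REQS   (QUEUE_SIZE - 1)     /* usable entries, cf. NVME_NUM_REQS */

    static bool ring_empty(int head, int tail)
    {
        return head == tail;
    }

    static bool ring_full(int head, int tail)
    {
        /* One slot stays unused; a completely full ring would otherwise
         * also satisfy head == tail and be indistinguishable from empty. */
        return (tail + 1) % QUEUE_SIZE == head;
    }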

Signed-off-by: Stefan Hajnoczi 
---
 block/nvme.c | 81 ++--
 1 file changed, 54 insertions(+), 27 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 6bf58bc6aa..3ad4f27e1c 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -33,6 +33,12 @@
 #define NVME_QUEUE_SIZE 128
 #define NVME_BAR_SIZE 8192
 
+/*
+ * We have to leave one slot empty as that is the full queue case where
+ * head == tail + 1.
+ */
+#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
+
 typedef struct {
 int32_t  head, tail;
 uint8_t  *queue;
@@ -47,7 +53,7 @@ typedef struct {
 int cid;
 void *prp_list_page;
 uint64_t prp_list_iova;
-bool busy;
+int free_req_next; /* q->reqs[] index of next free req */
 } NVMeRequest;
 
 typedef struct {
@@ -61,7 +67,8 @@ typedef struct {
 /* Fields protected by @lock */
 NVMeQueue   sq, cq;
 int cq_phase;
-NVMeRequest reqs[NVME_QUEUE_SIZE];
+int free_req_head;
+NVMeRequest reqs[NVME_NUM_REQS];
 bool        busy;
 int need_kick;
 int inflight;
@@ -200,19 +207,23 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
qemu_mutex_init(&q->lock);
 q->index = idx;
qemu_co_queue_init(&q->free_req_queue);
-q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
+q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
 r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
-  s->page_size * NVME_QUEUE_SIZE,
+  s->page_size * NVME_NUM_REQS,
  false, &prp_list_iova);
 if (r) {
 goto fail;
 }
-for (i = 0; i < NVME_QUEUE_SIZE; i++) {
+q->free_req_head = -1;
+for (i = 0; i < NVME_NUM_REQS; i++) {
NVMeRequest *req = &q->reqs[i];
 req->cid = i + 1;
+req->free_req_next = q->free_req_head;
+q->free_req_head = i;
 req->prp_list_page = q->prp_list_pages + i * s->page_size;
 req->prp_list_iova = prp_list_iova + i * s->page_size;
 }
+
nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
@@ -254,13 +265,11 @@ static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
  */
 static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
 {
-int i;
-NVMeRequest *req = NULL;
+NVMeRequest *req;
 
qemu_mutex_lock(&q->lock);
-while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
-/* We have to leave one slot empty as that is the full queue case (head
- * == tail + 1). */
+
+while (q->free_req_head == -1) {
 if (qemu_in_coroutine()) {
 trace_nvme_free_req_queue_wait(q);
qemu_co_queue_wait(&q->free_req_queue, &q->lock);
@@ -269,20 +278,41 @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
 return NULL;
 }
 }
-for (i = 0; i < NVME_QUEUE_SIZE; i++) {
-if (!q->reqs[i].busy) {
-q->reqs[i].busy = true;
-req = &q->reqs[i];
-break;
-}
-}
-/* We have checked inflight and need_kick while holding q->lock, so one
- * free req must be available. */
-assert(req);
+
+req = &q->reqs[q->free_req_head];
+q->free_req_head = req->free_req_next;
+req->free_req_next = -1;
+
qemu_mutex_unlock(&q->lock);
 return req;
 }
 
+/* With q->lock */
+static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
+{
+req->free_req_next = q->free_req_head;
+q->free_req_head = req - q->reqs;
+}
+
+/* With q->lock */
+static void nvme_wake_free_req_locked(BDRVNVMeState *s, NVMeQueuePair *q)
+{
+if (!qemu_co_queue_empty(&q->free_req_queue)) {
+replay_bh_schedule_oneshot_event(s->aio_context,
+nvme_free_req_queue_cb, q);
+}
+}
+
+/* Insert a request in the freelist and wake waiters */
+static void nvme_put_free_req_and_wake(BDRVNVMeState *s, NVMeQueuePair *q,
+   NVMeRequest *req)
+{
+qemu_mutex_lock(&q->lock);
+nvme_put_free_req_locked(q, req);
+nvme_wake_free_req_locked(s, q);
+qemu_mutex_unlock(&q->lock);
+}
+
 static inline int nvme_translate_error(const NvmeCqe *c)
 {
 uint16_t status = (le16_to_cpu(c->status) >> 1)
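
For context, here is how the new helpers are meant to pair with
nvme_get_free_req() on the failure path that issues 1 and 2 describe.
This caller is a hypothetical sketch, not the actual QEMU call site, and
map_buffers() is a made-up stand-in for the real mapping step:

    /* Hypothetical submission path, not the actual QEMU call site */
    static int submit_one(BDRVNVMeState *s, NVMeQueuePair *q, NvmeCmd *cmd)
    {
        NVMeRequest *req = nvme_get_free_req(q);    /* may yield in coroutine */
        if (!req) {
            return -EBUSY;
        }
        if (map_buffers(req, cmd) < 0) {            /* hypothetical helper */
            /*
             * The command was never queued: return the request to the
             * freelist under q->lock and wake any coroutine parked on
             * free_req_queue -- exactly what the old busy flag missed.
             */
            nvme_put_free_req_and_wake(s, q, req);
            return -ENOMEM;
        }
        /* ... fill the SQ entry and nvme_kick() the queue ... */
        return 0;
    }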