Il 28/08/2012 15:37, Stefan Hajnoczi ha scritto:
>> > The "right fix" would not be much more complex though, something like 
>> > this, right?
>> > (untested).
> Yes but it's more complicated.  To do a really good job we should
> slice off the first/last clusters if they are unaligned, handle them
> like regular allocating writes, and handle the middle of the request
> as a zero write.
> 
> I decided to do the simplest implementation since this scenario only
> occurs in test cases, not real guests.

Yes, I was curious because it reminded me of the patch I did to write
zeroes when I was playing with discard to avoid the large bounce buffer
in qed_aio_write_inplace.  That patch takes care of processing clusters
one by one (though that means one L2 write for each and every cluster,
not just the first and last).

It probably causes a performance hit, but anyway I attach it for
completeness.

Paolo
>From 98f2978ae5d0f34dca0369fcc727d1e533c0e6b0 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonz...@redhat.com>
Date: Thu, 8 Mar 2012 14:48:29 +0100
Subject: [PATCH 1/2] qed: make write-zeroes bounce buffer smaller than a
 single cluster

Currently, a write-zeroes operation could allocates memory for the whole
I/O operation if it is not aligned.  This is not necessary, because only
two unaligned clusters could be written.

This makes the write-zeroes operation proceed one cluster at a time,
even if all clusters are currently available and zero.  This does cause
worse performance due to multiple L2 table writes.  However, if zero-write
detection is enabled it means we're not interested in maximum performance.

Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
---
 block/qed.c | 75 +++++++++++++++++++++++++++++++++++++------------------------
 1 file modificato, 46 inserzioni(+), 29 rimozioni(-)

diff --git a/block/qed.c b/block/qed.c
index a02dbfd..bcea346 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -869,7 +869,9 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
     /* Free the buffer we may have allocated for zero writes */
     if (acb->flags & QED_AIOCB_ZERO) {
         qemu_vfree(acb->qiov->iov[0].iov_base);
-        acb->qiov->iov[0].iov_base = NULL;
+        qemu_iovec_destroy(acb->qiov);
+        g_free(acb->qiov);
+        acb->qiov = NULL;
     }
 
     /* Arrange for a bh to invoke the completion function */
@@ -1096,6 +1098,34 @@ static void qed_aio_write_zero_cluster(void *opaque, int ret)
 }
 
 /**
+ * Calculate the I/O vector
+ *
+ * @acb:        Write request
+ * @len:        Length in bytes
+ */
+static void qed_prepare_qiov(QEDAIOCB *acb, size_t len)
+{
+    /* Calculate the I/O vector */
+    if (acb->flags & QED_AIOCB_ZERO) {
+        /* Allocate buffer for zero writes */
+        if (!acb->qiov) {
+            BDRVQEDState *s = acb_to_s(acb);
+            char *base;
+
+            acb->qiov = g_malloc(sizeof(QEMUIOVector));
+            base = qemu_blockalign(s->bs, s->header.cluster_size);
+            qemu_iovec_init(acb->qiov, 1);
+            qemu_iovec_add(acb->qiov, base, s->header.cluster_size);
+            memset(base, 0, s->header.cluster_size);
+        }
+        assert(len <= acb->qiov->size);
+        qemu_iovec_add(&acb->cur_qiov, acb->qiov->iov[0].iov_base, len);
+    } else {
+        qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+    }
+}
+
+/**
  * Write new data cluster
  *
  * @acb:        Write request
@@ -1124,21 +1154,20 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 
     acb->cur_nclusters = qed_bytes_to_clusters(s,
             qed_offset_into_cluster(s, acb->cur_pos) + len);
-    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
 
     if (acb->flags & QED_AIOCB_ZERO) {
         /* Skip ahead if the clusters are already zero */
         if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
-            qed_aio_next_io(acb, 0);
-            return;
+            cb = qed_aio_next_io;
+        } else {
+            cb = qed_aio_write_zero_cluster;
         }
-
-        cb = qed_aio_write_zero_cluster;
     } else {
         cb = qed_aio_write_prefill;
         acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
     }
 
+    qed_prepare_qiov(acb, len);
     if (qed_should_set_need_check(s)) {
         s->header.features |= QED_F_NEED_CHECK;
         qed_write_header(s, cb, acb);
@@ -1158,19 +1187,8 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
  */
 static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
 {
-    /* Allocate buffer for zero writes */
-    if (acb->flags & QED_AIOCB_ZERO) {
-        struct iovec *iov = acb->qiov->iov;
-
-        if (!iov->iov_base) {
-            iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len);
-            memset(iov->iov_base, 0, iov->iov_len);
-        }
-    }
-
-    /* Calculate the I/O vector */
     acb->cur_cluster = offset;
-    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+    qed_prepare_qiov(acb, len);
 
     /* Do the actual write */
     qed_aio_write_main(acb, 0);
@@ -1270,6 +1288,7 @@ static void qed_aio_next_io(void *opaque, int ret)
 {
     QEDAIOCB *acb = opaque;
     BDRVQEDState *s = acb_to_s(acb);
+    uint64_t len;
     QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
                                 qed_aio_write_data : qed_aio_read_data;
 
@@ -1291,10 +1310,14 @@ static void qed_aio_next_io(void *opaque, int ret)
         return;
     }
 
+    /* Limit buffer size to one cluster when writing zeroes.  */
+    len = acb->end_pos - acb->cur_pos;
+    if (acb->flags & QED_AIOCB_ZERO) {
+        len = MIN(len, s->header.cluster_size);
+    }
+
     /* Find next cluster and start I/O */
-    qed_find_cluster(s, &acb->request,
-                      acb->cur_pos, acb->end_pos - acb->cur_pos,
-                      io_fn, acb);
+    qed_find_cluster(s, &acb->request, acb->cur_pos, len, io_fn, acb);
 }
 
 static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
@@ -1315,7 +1338,7 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
     acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
     acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE;
     acb->request.l2_table = NULL;
-    qemu_iovec_init(&acb->cur_qiov, qiov->niov);
+    qemu_iovec_init(&acb->cur_qiov, qiov ? qiov->niov : 1);
 
     /* Start request */
     qed_aio_next_io(acb, 0);
@@ -1364,17 +1387,11 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
 {
     BlockDriverAIOCB *blockacb;
     QEDWriteZeroesCB cb = { .done = false };
-    QEMUIOVector qiov;
-    struct iovec iov;
 
     /* Zero writes start without an I/O buffer.  If a buffer becomes necessary
      * then it will be allocated during request processing.
      */
-    iov.iov_base = NULL,
-    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE,
-
-    qemu_iovec_init_external(&qiov, &iov, 1);
-    blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
+    blockacb = qed_aio_setup(bs, sector_num, NULL, nb_sectors,
                              qed_co_write_zeroes_cb, &cb,
                              QED_AIOCB_WRITE | QED_AIOCB_ZERO);
     if (!blockacb) {
-- 
1.7.11.2


>From c74de5ae7650233574fb1572bc3b463864af8a3e Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonz...@redhat.com>
Date: Tue, 28 Aug 2012 16:09:23 +0200
Subject: [PATCH 2/2] qed: do copy-on-write for the first and last cluster in
 a misaligned write-zero request

Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
---
 block/qed.c | 11 +++++++++++
 1 file modificato, 11 inserzioni(+)

diff --git a/block/qed.c b/block/qed.c
index bcea346..0f8d06c 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1160,9 +1160,20 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
         if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
             cb = qed_aio_next_io;
         } else {
+            /* For misaligned requests, do normal copy-on-write for the first
+             * and last cluster.
+             */
+            unsigned offset = qed_offset_into_cluster(s, acb->cur_pos);
+            if (offset != 0 || qed_offset_into_cluster(s, offset + len) != 0) {
+                acb->cur_nclusters = 1;
+                len = MIN(len, s->header.cluster_size - offset);
+                goto copy;
+            }
+
             cb = qed_aio_write_zero_cluster;
         }
     } else {
+copy:
         cb = qed_aio_write_prefill;
         acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
     }
-- 
1.7.11.2

Reply via email to