[PATCH v2 3/3] iotests, parallels: Add a test for duplicated clusters

2022-08-05 Thread alexander . ivanov
From: Alexander Ivanov 

Check that the original and the duplicated offsets refer to the same cluster.
Repair the image and check that writing to one referred cluster
doesn't affect the other referred cluster.

Signed-off-by: Natalia Kuzmina 
Signed-off-by: Alexander Ivanov 
---
 tests/qemu-iotests/314|  89 ++
 tests/qemu-iotests/314.out|  36 +++
 .../parallels-2-duplicated-cluster.bz2| Bin 0 -> 148 bytes
 3 files changed, 125 insertions(+)
 create mode 100755 tests/qemu-iotests/314
 create mode 100644 tests/qemu-iotests/314.out
 create mode 100644 tests/qemu-iotests/sample_images/parallels-2-duplicated-cluster.bz2

diff --git a/tests/qemu-iotests/314 b/tests/qemu-iotests/314
new file mode 100755
index 00..79b4d3a749
--- /dev/null
+++ b/tests/qemu-iotests/314
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+# group: rw auto quick
+#
+# Test qemu-img check on duplicated clusters
+#
+# Copyright (c) 2022 Virtuozzo International GmbH
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator natalia.kuzm...@openvz.org
+
+owner=alexander.iva...@virtuozzo.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+status=1        # failure is the default!
+
+_cleanup()
+{
+_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.pattern
+
+_supported_fmt parallels
+_supported_proto file
+_supported_os Linux
+
+echo
+echo "using sample corrupted image"
+echo
+_use_sample_img parallels-2-duplicated-cluster.bz2
+
+CLUSTER_SIZE=65536
+
+#read one cluster from original offset
+$QEMU_IO -c "read -P 0x11 0 $CLUSTER_SIZE" "$TEST_IMG" | \
+_filter_qemu_io
+#read from duplicated offset (data must be the same as on original offset)
+$QEMU_IO -c "read -P 0x11 $((4 * CLUSTER_SIZE)) $CLUSTER_SIZE" "$TEST_IMG" | \
+_filter_qemu_io
+#change data from original offset
+$QEMU_IO -c "write -P 0x55 0 $CLUSTER_SIZE" "$TEST_IMG" | \
+_filter_qemu_io
+#read from duplicated offset (data must be the same as on original offset)
+$QEMU_IO -c "read -P 0x55 $((4 * CLUSTER_SIZE)) $CLUSTER_SIZE" "$TEST_IMG" | \
+_filter_qemu_io
+echo
+echo "check and repair the image for parallels format"
+echo
+_check_test_img -r all
+echo
+
+#read one cluster from original offset
+$QEMU_IO -c "read -P 0x55 0 $CLUSTER_SIZE" "$TEST_IMG" | \
+_filter_qemu_io
+#read copied data from new offset
+$QEMU_IO -c "read -P 0x55 $((4 * CLUSTER_SIZE)) $CLUSTER_SIZE" "$TEST_IMG" | \
+_filter_qemu_io
+#change data from original offset
+$QEMU_IO -c "write -P 0x11 0 $CLUSTER_SIZE" "$TEST_IMG" | \
+_filter_qemu_io
+#read from new offset (the pattern check fails: after repair this cluster is independent and was left unchanged)
+$QEMU_IO -c "read -P 0x11 $((4 * CLUSTER_SIZE)) $CLUSTER_SIZE" "$TEST_IMG" | \
+_filter_qemu_io
+
+echo
+echo
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/314.out b/tests/qemu-iotests/314.out
new file mode 100644
index 00..c36022c407
--- /dev/null
+++ b/tests/qemu-iotests/314.out
@@ -0,0 +1,36 @@
+QA output created by 314
+
+using sample corrupted image
+
+read 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 262144
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 262144
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+check and repair the image
+
+Repairing BAT offset in entry 4 duplicates offset in entry 0
+The following inconsistencies were found and repaired:
+
+0 leaked clusters
+1 corruptions
+
+Double checking the fixed image now...
+No errors were found on the image.
+
+read 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 262144
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Pattern verification failed at offset 262144, 65536 bytes
+read 65536/65536 bytes at offset 262144
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+
+*** done
diff --git 
a/tests/qemu-iotests/sample_images/parallels-2-duplicated-cluster.bz2 

[PATCH v2 2/3] parallels: Add checking and repairing duplicate offsets in BAT

2022-08-05 Thread alexander . ivanov
From: Alexander Ivanov 

The image file could be corrupted in this way:
two guest memory areas refer to the same host cluster.

If a duplicated offset is found, fix it by copying the content
of the referred cluster to a newly allocated cluster and
replacing one of the two referring entries with the new cluster offset.

Signed-off-by: Natalia Kuzmina 
Signed-off-by: Alexander Ivanov 
---
 block/parallels.c | 135 ++
 1 file changed, 135 insertions(+)

diff --git a/block/parallels.c b/block/parallels.c
index a0eb7ec3c3..73264b4bd1 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -64,6 +64,11 @@ static QEnumLookup prealloc_mode_lookup = {
 #define PARALLELS_OPT_PREALLOC_MODE "prealloc-mode"
 #define PARALLELS_OPT_PREALLOC_SIZE "prealloc-size"
 
+#define REVERSED_BAT_UNTOUCHED  0xFFFFFFFF
+
+#define HOST_CLUSTER_INDEX(s, off) \
+((off - ((s->header->data_off) << BDRV_SECTOR_BITS)) / (s->cluster_size))
+
 static QemuOptsList parallels_runtime_opts = {
 .name = "parallels",
 .head = QTAILQ_HEAD_INITIALIZER(parallels_runtime_opts.head),
@@ -559,6 +564,131 @@ static int check_leak(BlockDriverState *bs,
 return 0;
 }
 
+static int check_duplicate(BlockDriverState *bs,
+   BdrvCheckResult *res,
+   BdrvCheckMode fix)
+{
+BDRVParallelsState *s = bs->opaque;
+QEMUIOVector qiov;
+int64_t off, high_off, size, sector_num;
+uint32_t i, idx_host;
+int ret = 0, n;
+g_autofree uint32_t *reversed_bat = NULL;
+g_autofree int64_t *cluster_buf = NULL;
+bool sync_and_truncate = false;
+
+/*
+ * Make a reversed BAT.
+ * Each cluster in the host file could represent only one cluster
+ * from the guest point of view. Reversed BAT provides mapping of that type.
+ * Initially the table is filled with REVERSED_BAT_UNTOUCHED values.
+ */
+reversed_bat = g_malloc(s->bat_size * sizeof(uint32_t));
+for (i = 0; i < s->bat_size; i++) {
+reversed_bat[i] = REVERSED_BAT_UNTOUCHED;
+}
+
+cluster_buf = g_malloc(s->cluster_size);
+qemu_iovec_init(&qiov, 0);
+qemu_iovec_add(&qiov, cluster_buf, s->cluster_size);
+
+high_off = 0;
+for (i = 0; i < s->bat_size; i++) {
+off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+if (off == 0) {
+continue;
+}
+
+if (off > high_off) {
+high_off = off;
+}
+
+idx_host = HOST_CLUSTER_INDEX(s, off);
+if (idx_host >= s->bat_size) {
+res->check_errors++;
+goto out;
+}
+
+if (reversed_bat[idx_host] != REVERSED_BAT_UNTOUCHED) {
+/*
+ * We have already written a guest cluster index for this
+ * host cluster. It means there is a duplicated offset in BAT.
+ */
+fprintf(stderr,
+"%s BAT offset in entry %u duplicates offset in entry 
%u\n",
+fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
+i, reversed_bat[idx_host]);
+res->corruptions++;
+
+if (fix & BDRV_FIX_ERRORS) {
+/*
+ * Write zero to the current BAT entry, allocate a new cluster
+ * for the relevant guest offset and copy the host cluster
+ * to the newly allocated cluster.
+ * In this way we will have two identical clusters and two
+ * BAT entries with the offsets of these clusters.
+ */
+s->bat_bitmap[i] = 0;
+
+sector_num = bat2sect(s, reversed_bat[idx_host]);
+ret = bdrv_pread(bs->file, sector_num << BDRV_SECTOR_BITS,
+ s->cluster_size, cluster_buf, 0);
+if (ret < 0) {
+res->check_errors++;
+goto out;
+}
+
+sector_num = (i * s->cluster_size) >> BDRV_SECTOR_BITS;
+off = allocate_clusters(bs, sector_num, s->tracks, &n);
+if (off < 0) {
+res->check_errors++;
+ret = off;
+goto out;
+}
+off <<= BDRV_SECTOR_BITS;
+
+ret = bdrv_co_pwritev(bs->file, off, s->cluster_size, &qiov, 0);
+if (ret < 0) {
+res->check_errors++;
+goto out;
+}
+
+/* off is new, so update idx_host accordingly. */
+idx_host = HOST_CLUSTER_INDEX(s, off);
+res->corruptions_fixed++;
+sync_and_truncate = true;
+}
+}
+reversed_bat[idx_host] = i;
+}
+
+if (sync_and_truncate) {
+ret = sync_header(bs, res);
+if (ret < 0) {
+goto out;
+}
+
+size = bdrv_getlength(bs->file->bs);
+if (size < 0) {
+res->check_errors++;
+  

[PATCH v2 1/3] parallels: Put the image checks in separate functions

2022-08-05 Thread alexander . ivanov
From: Alexander Ivanov 

We will add more and more image checks, so we need to reorganize the code.
Put each check into a separate helper function with its own loop.
Add two helpers: truncate_file() and sync_header(). They will be used
in multiple functions.

Signed-off-by: Alexander Ivanov 
---
 block/parallels.c | 180 ++
 1 file changed, 133 insertions(+), 47 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index a229c06f25..a0eb7ec3c3 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -413,24 +413,41 @@ static coroutine_fn int parallels_co_readv(BlockDriverState *bs,
 return ret;
 }
 
-
-static int coroutine_fn parallels_co_check(BlockDriverState *bs,
-   BdrvCheckResult *res,
-   BdrvCheckMode fix)
+static int sync_header(BlockDriverState *bs, BdrvCheckResult *res)
 {
 BDRVParallelsState *s = bs->opaque;
-int64_t size, prev_off, high_off;
 int ret;
-uint32_t i;
-bool flush_bat = false;
 
-size = bdrv_getlength(bs->file->bs);
-if (size < 0) {
+ret = bdrv_co_pwrite_sync(bs->file, 0, s->header_size, s->header, 0);
+if (ret < 0) {
 res->check_errors++;
-return size;
 }
+return ret;
+}
+
+static int truncate_file(BlockDriverState *bs, int64_t size)
+{
+Error *local_err = NULL;
+int ret;
+
+/*
+ * In order to really repair the image, we must shrink it.
+ * That means we have to pass exact=true.
+ */
+ret = bdrv_truncate(bs->file, size, true,
+PREALLOC_MODE_OFF, 0, &local_err);
+if (ret < 0) {
+error_report_err(local_err);
+}
+return ret;
+}
+
+static void check_unclean(BlockDriverState *bs,
+  BdrvCheckResult *res,
+  BdrvCheckMode fix)
+{
+BDRVParallelsState *s = bs->opaque;
 
-qemu_co_mutex_lock(&s->lock);
 if (s->header_unclean) {
 fprintf(stderr, "%s image was not closed correctly\n",
 fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR");
@@ -441,78 +458,147 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs,
 s->header_unclean = false;
 }
 }
+}
 
-res->bfi.total_clusters = s->bat_size;
-res->bfi.compressed_clusters = 0; /* compression is not supported */
+static int check_cluster_outside_image(BlockDriverState *bs,
+   BdrvCheckResult *res,
+   BdrvCheckMode fix)
+{
+BDRVParallelsState *s = bs->opaque;
+int64_t size;
+uint32_t i;
+bool sync = false;
+
+size = bdrv_getlength(bs->file->bs);
+if (size < 0) {
+res->check_errors++;
+return size;
+}
 
-high_off = 0;
-prev_off = 0;
 for (i = 0; i < s->bat_size; i++) {
-int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS;
-if (off == 0) {
-prev_off = 0;
-continue;
-}
-
-/* cluster outside the image */
-if (off > size) {
+if ((bat2sect(s, i) << BDRV_SECTOR_BITS) > size) {
 fprintf(stderr, "%s cluster %u is outside image\n",
 fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
 res->corruptions++;
 if (fix & BDRV_FIX_ERRORS) {
-prev_off = 0;
 s->bat_bitmap[i] = 0;
 res->corruptions_fixed++;
-flush_bat = true;
-continue;
+sync = true;
 }
 }
+}
 
-res->bfi.allocated_clusters++;
-if (off > high_off) {
-high_off = off;
+if (sync) {
+return sync_header(bs, res);
+}
+
+return 0;
+}
+
+static void check_fragmentation(BlockDriverState *bs,
+BdrvCheckResult *res,
+BdrvCheckMode fix)
+{
+BDRVParallelsState *s = bs->opaque;
+int64_t off, prev_off;
+uint32_t i;
+
+prev_off = 0;
+for (i = 0; i < s->bat_size; i++) {
+off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+if (off == 0) {
+prev_off = 0;
+continue;
 }
-
 if (prev_off != 0 && (prev_off + s->cluster_size) != off) {
 res->bfi.fragmented_clusters++;
 }
 prev_off = off;
 }
+}
 
-ret = 0;
-if (flush_bat) {
-ret = bdrv_co_pwrite_sync(bs->file, 0, s->header_size, s->header, 0);
-if (ret < 0) {
-res->check_errors++;
-goto out;
+static int check_leak(BlockDriverState *bs,
+  BdrvCheckResult *res,
+  BdrvCheckMode fix)
+{
+BDRVParallelsState *s = bs->opaque;
+int64_t size, off, high_off, count;
+uint32_t i;
+
+size = bdrv_getlength(bs->file->bs);
+if (size < 0) {
+res->check_errors++;
+return size;
+}
+
+high_off = 0;
+for 

[PATCH v2 0/3] Check and repair duplicated clusters in parallels images

2022-08-05 Thread alexander . ivanov
From: Alexander Ivanov 

We will add more and more image checks, so we need to reorganize the code.
Put each check into a separate helper function with its own loop.
Add two helpers: truncate_file() and sync_header(). They will be used
in multiple functions.

A Parallels image file can be corrupted in this way: two guest memory areas
refer to the same host memory area (duplicated offsets in the BAT).
qemu-img check copies the data from the duplicated cluster to a new cluster and
writes the new cluster's offset to the BAT instead of the duplicated one.

Test 314 uses the sample corrupted image parallels-2-duplicated-cluster.bz2.
Reading from the duplicated offset and from the original offset returns the same
data. After repairing, changing either of these blocks of data
does not affect the other one.
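
For readers new to the format: the detection in patch 2 boils down to building a
reverse mapping from host clusters back to BAT entries. A condensed, standalone
illustration of that idea (the names and the printf are only for this sketch; the
real code is check_duplicate() in patch 2):

  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>

  /* host_idx[i] is the host cluster backing BAT entry i,
   * or UINT32_MAX if entry i is unallocated. */
  static void report_duplicates(const uint32_t *host_idx, uint32_t bat_size)
  {
      /* reverse[h] = first BAT entry seen that maps to host cluster h */
      uint32_t *reverse = malloc(bat_size * sizeof(uint32_t));
      for (uint32_t h = 0; h < bat_size; h++) {
          reverse[h] = UINT32_MAX;
      }
      for (uint32_t i = 0; i < bat_size; i++) {
          uint32_t h = host_idx[i];
          if (h == UINT32_MAX) {
              continue;                 /* unallocated guest cluster */
          }
          if (reverse[h] != UINT32_MAX) {
              /* the repair copies cluster h to a new cluster and repoints entry i */
              printf("entry %u duplicates entry %u\n", i, reverse[h]);
          } else {
              reverse[h] = i;
          }
      }
      free(reverse);
  }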

v2 changes:
 * Split parallels_co_check() to separate functions.
 * Move buffer allocation outside the loop.
 * Replace work with internals by zeroing BAT entries and
   calling allocate_clusters().
 * Make the reverse table unsigned and replace -1 with 0xFFFFFFFF.
 * Some minor fixes.
 * Add more detailed comments.

Alexander Ivanov (3):
  parallels: Put the image checks in separate functions
  parallels: Add checking and repairing duplicate offsets in BAT
  iotests, parallels: Add a test for duplicated clusters

 block/parallels.c | 315 +++---
 tests/qemu-iotests/314|  89 +
 tests/qemu-iotests/314.out|  36 ++
 .../parallels-2-duplicated-cluster.bz2| Bin 0 -> 148 bytes
 4 files changed, 393 insertions(+), 47 deletions(-)
 create mode 100755 tests/qemu-iotests/314
 create mode 100644 tests/qemu-iotests/314.out
 create mode 100644 
tests/qemu-iotests/sample_images/parallels-2-duplicated-cluster.bz2

-- 
2.34.1




Re: [PATCH v2 00/11] Refactor bdrv_try_set_aio_context using transactions

2022-08-05 Thread Vladimir Sementsov-Ogievskiy

On 8/5/22 16:36, Emanuele Giuseppe Esposito wrote:



Am 05/08/2022 um 15:22 schrieb Emanuele Giuseppe Esposito:



Am 27/07/2022 um 18:13 schrieb Vladimir Sementsov-Ogievskiy:

On 7/25/22 15:21, Emanuele Giuseppe Esposito wrote:

The aim of this series is to reorganize bdrv_try_set_aio_context
and drop BDS ->set_aio_context and ->can_set_aio_ctx callbacks in
favour of a new one, ->change_aio_ctx.

More information in patch 3 (which is also RFC, due to the doubts
I have with AioContext locks).

Patch 1 just adds assertions in the code, 2 extends the transactions
API to be able to add also transactions in the tail
of the list.
Patch 3 is the core of this series, and introduces the new callback.
It is marked as RFC and the reason is explained in the commit message.
Patches 4-5-6 implement ->change_aio_ctx in the various block, blockjob
and block-backend BDSes.
Patch 7 substitutes ->change_aio_ctx with the old callbacks, and
patch 8 takes care of deleting the old callbacks and unused code.

This series is based on "job: replace AioContext lock with job_mutex",
but just because it uses job_set_aio_context() introduced there.

Suggested-by: Paolo Bonzini
Based-on:<20220706201533.289775-1-eespo...@redhat.com>





So, I read your email before going on PTO and at that point I got what
your concerns were, but now after re-reading it I don't understand
anymore what you mean :)


What I dislike here is that you refactor aio-context-change to use
transactions, but you use it "internally" for aio-context-change.
aio-context-change doesn't become a native part of block-graph
modifiction transaction system after the series.

For example, bdrv_attach_child_common(..., tran) still calls
bdrv_try_change_aio_context() function which doesn't take @tran
argument. And we have to call bdrv_try_change_aio_context() again in
bdrv_attach_child_common_abort() with opposite arguments by hand. We
create in _try_ separate Transaction object which is unrelated to the
original block-graph-change transaction.



This can be fixed: patch 4 "bdrv_child_try_change_aio_context: add
transaction parameter" supports taking transaction as a parameter.
bdrv_attach_child_common could simply call
bdrv_try_change_aio_context_tran (ok we need to deal with locking, but
it could work).


No actually I don't get how it can work in bdrv_attach_child_common.
We have the following:

parent_ctx = bdrv_child_get_parent_aio_context(new_child);
if (child_ctx != parent_ctx) {
   int ret = bdrv_try_change_aio_context(child_bs, parent_ctx, NULL,
  &local_err);

   if (ret < 0 && child_class->change_aio_ctx) {
   ret_child = child_class->change_aio_ctx(new_child, child_ctx,
   visited, tran, NULL);

   tran_finalize(tran, ret_child == true ? 0 : -1);
   }

   if (ret < 0) {
   return ret;
   }
}

bdrv_replace_child_noperm(&new_child, child_bs, true);

So bdrv_try_change_aio_context would be changed in
bdrv_try_change_aio_context_tran, but then how can we call
bdrv_replace_child_noperm if no aiocontext has been changed at all?
I don't think we can mix transactional operations with non-transactional.



So here, bdrv_try_change_aio_context() is .prepare in the way I mean.

And then in .abort we call bdrv_try_change_aio_context() again but with reverse argument, 
and it's a kind of ".abort".

Probably, we can refactor that, making a function bdrv_change_aio_context(, .., 
tran), which does what bdrv_try_change_aio_context does, and registers .abort 
callback, that will simulate calling bdrv_try_change_aio_context() with 
opposite argument.  But we should carefully refactor all the function names 
and avoid having nested transaction.
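
To make that shape concrete, here is a rough sketch of such a transaction action
(names and structure are mine, not from the series; error handling kept minimal;
it leans on the QEMU Transaction API's tran_add()/TransactionActionDrv):

  typedef struct {
      BlockDriverState *bs;
      AioContext *old_ctx;
  } AioCtxChangeState;

  static void change_aio_ctx_abort(void *opaque)
  {
      AioCtxChangeState *s = opaque;
      /* undo: move bs back to the context it came from */
      bdrv_try_change_aio_context(s->bs, s->old_ctx, NULL, &error_abort);
  }

  static TransactionActionDrv change_aio_ctx_drv = {
      .abort = change_aio_ctx_abort,
      .clean = g_free,
  };

  /* hypothetical: does the change in .prepare and registers the undo */
  static int bdrv_change_aio_context_tran(BlockDriverState *bs, AioContext *ctx,
                                          Transaction *tran, Error **errp)
  {
      AioCtxChangeState *s = g_new(AioCtxChangeState, 1);
      int ret;

      s->bs = bs;
      s->old_ctx = bdrv_get_aio_context(bs);

      ret = bdrv_try_change_aio_context(bs, ctx, NULL, errp); /* visible in .prepare */
      if (ret < 0) {
          g_free(s);
          return ret;
      }
      tran_add(tran, &change_aio_ctx_drv, s);
      return 0;
  }

(In a real refactoring the .abort path would presumably not go through the _try_
wrapper, but the overall shape is the same.)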





I think the main concern here is that during the prepare phase this
series doesn't change any aiocontext, so until we don't commit the rest
of the code cannot assume that the aiocontext has been changed.

But isn't it what happens also for permissions? Permission functions
like bdrv_drv_set_perm perform bdrv_check_perm() in .prepare(), and then
bdrv_set_perm() in commit.

Another important question is that if we actually want to put everything
in a single transaction, because .prepare functions like the one
proposed here perform drains, so the logic following prepare and
preceding commit must take into account that everything is drained. Also
prepare itself has to take into account that maybe other .prepare took
locks or drained themselves...


With good refactoring we should get rid of these _try_ functions, and
have just bdrv_change_aio_context(..., tran) that can be natively used
with external tran object, where other block-graph change actions
participate. This way we'll not have to call reverse
aio_context_change() in .abort, it will be done automatically.

Moreover, your  *aio_context_change* functions that do have tran
parameter cannot be simply used in the block-graph change transaction,
as you don't 

Re: [PATCH 2/2] util/aio-win32: Correct the event array size in aio_poll()

2022-08-05 Thread Stefan Weil via

Am 05.08.22 um 16:56 schrieb Bin Meng:


From: Bin Meng 

WaitForMultipleObjects() can only wait for MAXIMUM_WAIT_OBJECTS
object handles. Correct the event array size in aio_poll() and
add an assert() to ensure it does not cause out-of-bounds access.

Signed-off-by: Bin Meng 
---

  util/aio-win32.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/util/aio-win32.c b/util/aio-win32.c
index 44003d645e..8cf5779567 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -326,7 +326,7 @@ void aio_dispatch(AioContext *ctx)
  bool aio_poll(AioContext *ctx, bool blocking)
  {
  AioHandler *node;
-HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
+HANDLE events[MAXIMUM_WAIT_OBJECTS];
  bool progress, have_select_revents, first;
  int count;
  int timeout;
@@ -369,6 +369,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
  QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
  if (!node->deleted && node->io_notify
  && aio_node_check(ctx, node->is_external)) {
+assert(count < MAXIMUM_WAIT_OBJECTS);



Would using g_assert for new code be better? Currently the rest of that 
file (and most QEMU code) uses assert.


count could also be changed from int to unsigned (which matches better 
to the unsigned DWORD).
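
Just to visualize both suggestions together, the hunk could then read roughly as
follows (a sketch only, not a request for a particular form; count's declaration
would change from int to unsigned):

  unsigned count;
  ...
  QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
      if (!node->deleted && node->io_notify
          && aio_node_check(ctx, node->is_external)) {
          g_assert(count < MAXIMUM_WAIT_OBJECTS);
          events[count++] = event_notifier_get_handle(node->e);
      }
  }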


Reviewed-by: Stefan Weil 





Re: [PATCH v2 00/11] Refactor bdrv_try_set_aio_context using transactions

2022-08-05 Thread Emanuele Giuseppe Esposito



Am 05/08/2022 um 16:35 schrieb Vladimir Sementsov-Ogievskiy:
> On 8/5/22 16:22, Emanuele Giuseppe Esposito wrote:
>>
>>
>> Am 27/07/2022 um 18:13 schrieb Vladimir Sementsov-Ogievskiy:
>>> On 7/25/22 15:21, Emanuele Giuseppe Esposito wrote:
 The aim of this series is to reorganize bdrv_try_set_aio_context
 and drop BDS ->set_aio_context and ->can_set_aio_ctx callbacks in
 favour of a new one, ->change_aio_ctx.

 More information in patch 3 (which is also RFC, due to the doubts
 I have with AioContext locks).

 Patch 1 just adds assertions in the code, 2 extends the transactions
 API to be able to add also transactions in the tail
 of the list.
 Patch 3 is the core of this series, and introduces the new callback.
 It is marked as RFC and the reason is explained in the commit message.
 Patches 4-5-6 implement ->change_aio_ctx in the various block, blockjob
 and block-backend BDSes.
 Patch 7 substitutes ->change_aio_ctx with the old callbacks, and
 patch 8 takes care of deleting the old callbacks and unused code.

 This series is based on "job: replace AioContext lock with job_mutex",
 but just because it uses job_set_aio_context() introduced there.

 Suggested-by: Paolo Bonzini
 Based-on:<20220706201533.289775-1-eespo...@redhat.com>
>>>
>>>
>>
>> So, I read your email before going on PTO and at that point I got what
>> your concerns were, but now after re-reading it I don't understand
>> anymore what you mean :)
>>
>>> What I dislike here is that you refactor aio-context-change to use
>>> transactions, but you use it "internally" for aio-context-change.
>>> aio-context-change doesn't become a native part of block-graph
>>> modification transaction system after the series.
>>>
>>> For example, bdrv_attach_child_common(..., tran) still calls
>>> bdrv_try_change_aio_context() function which doesn't take @tran
>>> argument. And we have to call bdrv_try_change_aio_context() again in
>>> bdrv_attach_child_common_abort() with opposite arguments by hand. We
>>> create in _try_ separate Transaction object which is unrelated to the
>>> original block-graph-change transaction.
>>>
>>
>> This can be fixed: patch 4 "bdrv_child_try_change_aio_context: add
>> transaction parameter" supports taking transaction as a parameter.
>> bdrv_attach_child_common could simply call
>> bdrv_try_change_aio_context_tran (ok we need to deal with locking, but
>> it could work).
>>
>> I think the main concern here is that during the prepare phase this
>> series doesn't change any aiocontext, so until we don't commit the rest
>> of the code cannot assume that the aiocontext has been changed.
>>
>> But isn't it what happens also for permissions? Permission functions
>> like bdrv_drv_set_perm perform bdrv_check_perm() in .prepare(), and then
>> bdrv_set_perm() in commit.
> 
> Not exactly.
> 
> Partly that's just old bad naming. Ideally, driver handlers should be
> refactored to have one
> .bdrv_set_perm(, ... tran, errp) handler. Or at least renamed to
> .prepare and .commit instead of misleading .check and .set.
> 
> Secondly what is important, is that corresponding .set actions are not
> visible to other block-graph modifying actions (like taking locks on fd.
> other actions, like attach/detach children don't care of it)/ (Or, at
> least, they _should not be_ visible :) To be honest, I don't have real
> expertise, of how much these .set actions are visible or not by other
> block-graph modifying actions, but I believe that we are OK in common
> scenarios).
> 
> What is really visible at generic block layer? Visible is change of
> .perm / .shared_perm fields of BdrvChild. And they are set in .prepare,
> look at bdrv_child_set_perm().
> 
> So, after calling bdrv_child_set_perm() other actions of .prepare stage
> will see _updated_ permissions. And if transaction fails, we rollback
> .perm and .shared_perm fields in bdrv_child_set_perm_abort()
> 
> 
>>
>> Another important question is that if we actually want to put everything
>> in a single transaction, because .prepare functions like the one
>> proposed here perform drains, so the logic following prepare and
>> preceding commit must take into account that everything is drained. Also
>> prepare itself has to take into account that maybe other .prepare took
>> locks or drained themselves...
> 
> Yes, untying the knot of drained sections during aio context change is a
> challenge. And that's (at least one of) the reasons why aio-context
> change is still not a native part of block graph modifying transactions.
> 
> Could there be some simple solution?
> 
> Like
> 
> 1. drain the whole subgraph of all nodes connected with nodes that we
> are going to touch
> 
> 2. do block graph modifying transaction (including aio context change)
> knowing that everything we need is drained. (so we don't have to care
> about drain during aio context change)
> 
> 3. undrain the subgraph
> 
> In other words, is that really 

[PATCH 2/2] util/aio-win32: Correct the event array size in aio_poll()

2022-08-05 Thread Bin Meng
From: Bin Meng 

WaitForMultipleObjects() can only wait for MAXIMUM_WAIT_OBJECTS
object handles. Correct the event array size in aio_poll() and
add an assert() to ensure it does not cause out-of-bounds access.

Signed-off-by: Bin Meng 
---

 util/aio-win32.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/util/aio-win32.c b/util/aio-win32.c
index 44003d645e..8cf5779567 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -326,7 +326,7 @@ void aio_dispatch(AioContext *ctx)
 bool aio_poll(AioContext *ctx, bool blocking)
 {
 AioHandler *node;
-HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
+HANDLE events[MAXIMUM_WAIT_OBJECTS];
 bool progress, have_select_revents, first;
 int count;
 int timeout;
@@ -369,6 +369,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
 QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
 if (!node->deleted && node->io_notify
 && aio_node_check(ctx, node->is_external)) {
+assert(count < MAXIMUM_WAIT_OBJECTS);
 events[count++] = event_notifier_get_handle(node->e);
 }
 }
-- 
2.34.1




Re: [PATCH v2 00/11] Refactor bdrv_try_set_aio_context using transactions

2022-08-05 Thread Vladimir Sementsov-Ogievskiy

On 8/5/22 16:22, Emanuele Giuseppe Esposito wrote:



Am 27/07/2022 um 18:13 schrieb Vladimir Sementsov-Ogievskiy:

On 7/25/22 15:21, Emanuele Giuseppe Esposito wrote:

The aim of this series is to reorganize bdrv_try_set_aio_context
and drop BDS ->set_aio_context and ->can_set_aio_ctx callbacks in
favour of a new one, ->change_aio_ctx.

More information in patch 3 (which is also RFC, due to the doubts
I have with AioContext locks).

Patch 1 just adds assertions in the code, 2 extends the transactions
API to be able to add also transactions in the tail
of the list.
Patch 3 is the core of this series, and introduces the new callback.
It is marked as RFC and the reason is explained in the commit message.
Patches 4-5-6 implement ->change_aio_ctx in the various block, blockjob
and block-backend BDSes.
Patch 7 substitutes ->change_aio_ctx with the old callbacks, and
patch 8 takes care of deleting the old callbacks and unused code.

This series is based on "job: replace AioContext lock with job_mutex",
but just because it uses job_set_aio_context() introduced there.

Suggested-by: Paolo Bonzini
Based-on:<20220706201533.289775-1-eespo...@redhat.com>





So, I read your email before going on PTO and at that point I got what
your concerns were, but now after re-reading it I don't understand
anymore what you mean :)


What I dislike here is that you refactor aio-context-change to use
transactions, but you use it "internally" for aio-context-change.
aio-context-change doesn't become a native part of block-graph
modification transaction system after the series.

For example, bdrv_attach_child_common(..., tran) still calls
bdrv_try_change_aio_context() function which doesn't take @tran
argument. And we have to call bdrv_try_change_aio_context() again in
bdrv_attach_child_common_abort() with opposite arguments by hand. We
create in _try_ separate Transaction object which is unrelated to the
original block-graph-change transaction.



This can be fixed: patch 4 "bdrv_child_try_change_aio_context: add
transaction parameter" supports taking transaction as a parameter.
bdrv_attach_child_common could simply call
bdrv_try_change_aio_context_tran (ok we need to deal with locking, but
it could work).

I think the main concern here is that during the prepare phase this
series doesn't change any aiocontext, so until we don't commit the rest
of the code cannot assume that the aiocontext has been changed.

But isn't it what happens also for permissions? Permission functions
like bdrv_drv_set_perm perform bdrv_check_perm() in .prepare(), and then
bdrv_set_perm() in commit.


Not exactly.

Partly that's just old bad naming. Ideally, driver handlers should be 
refactored to have one
.bdrv_set_perm(, ... tran, errp) handler. Or at least renamed to .prepare and 
.commit instead of misleading .check and .set.

Secondly what is important, is that corresponding .set actions are not visible 
to other block-graph modifying actions (like taking locks on fd. other actions, 
like attach/detach children don't care of it)/ (Or, at least, they _should not 
be_ visible :) To be honest, I don't have real expertise, of how much these 
.set actions are visible or not by other block-graph modifying actions, but I 
believe that we are OK in common scenarios).

What is really visible at generic block layer? Visible is change of .perm / 
.shared_perm fields of BdrvChild. And they are set in .prepare, look at 
bdrv_child_set_perm().

So, after calling bdrv_child_set_perm() other actions of .prepare stage will 
see _updated_ permissions. And if transaction fails, we rollback .perm and 
.shared_perm fields in bdrv_child_set_perm_abort()




Another important question is that if we actually want to put everything
in a single transaction, because .prepare functions like the one
proposed here perform drains, so the logic following prepare and
preceding commit must take into account that everything is drained. Also
prepare itself has to take into account that maybe other .prepare took
locks or drained themselves...


Yes, untying the knot of drained sections during aio context change is a
challenge. And that's (at least one of) the reasons why aio-context change is
still not a native part of block graph modifying transactions.

Could there be some simple solution?

Like

1. drain the whole subgraph of all nodes connected with nodes that we are going 
to touch

2. do block graph modifying transaction (including aio context change) knowing 
that everything we need is drained. (so we don't have to care about drain 
during aio context change)

3. undrain the subgraph

In other words, is it really necessary to lock/unlock different contexts
during the aio-context-change procedure? Or can we move to a much larger and
simpler drained section?
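
In code, that simple solution might look roughly like this (hypothetical helper
name, error paths omitted):

  /* hypothetical shape of the "one big drained section" idea */
  static int change_graph_and_context(BlockDriverState *bs, AioContext *new_ctx,
                                      Error **errp)
  {
      Transaction *tran = tran_new();
      int ret;

      /* 1. drain the affected subgraph (the real code would have to cover
       *    every node connected to the ones being touched) */
      bdrv_drained_begin(bs);

      /* 2. one block-graph-modifying transaction, aio-context change included */
      ret = graph_modify_with_ctx_change(bs, new_ctx, tran, errp); /* made up */
      tran_finalize(tran, ret);   /* commit on success, abort on failure */

      /* 3. undrain */
      bdrv_drained_end(bs);
      return ret;
  }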




With good refactoring we should get rid of these _try_ functions, and
have just bdrv_change_aio_context(..., tran) that can be natively used
with external tran object, where other block-graph change actions
participate. This 

Re: [PATCH v2 00/11] Refactor bdrv_try_set_aio_context using transactions

2022-08-05 Thread Emanuele Giuseppe Esposito



Am 05/08/2022 um 15:22 schrieb Emanuele Giuseppe Esposito:
> 
> 
> Am 27/07/2022 um 18:13 schrieb Vladimir Sementsov-Ogievskiy:
>> On 7/25/22 15:21, Emanuele Giuseppe Esposito wrote:
>>> The aim of this series is to reorganize bdrv_try_set_aio_context
>>> and drop BDS ->set_aio_context and ->can_set_aio_ctx callbacks in
>>> favour of a new one, ->change_aio_ctx.
>>>
>>> More information in patch 3 (which is also RFC, due to the doubts
>>> I have with AioContext locks).
>>>
>>> Patch 1 just adds assertions in the code, 2 extends the transactions
>>> API to be able to add also transactions in the tail
>>> of the list.
>>> Patch 3 is the core of this series, and introduces the new callback.
>>> It is marked as RFC and the reason is explained in the commit message.
>>> Patches 4-5-6 implement ->change_aio_ctx in the various block, blockjob
>>> and block-backend BDSes.
>>> Patch 7 substitutes ->change_aio_ctx with the old callbacks, and
>>> patch 8 takes care of deleting the old callbacks and unused code.
>>>
>>> This series is based on "job: replace AioContext lock with job_mutex",
>>> but just because it uses job_set_aio_context() introduced there.
>>>
>>> Suggested-by: Paolo Bonzini
>>> Based-on:<20220706201533.289775-1-eespo...@redhat.com>
>>
>>
> 
> So, I read your email before going on PTO and at that point I got what
> your concerns were, but now after re-reading it I don't understand
> anymore what you mean :)
> 
>> What I dislike here is that you refactor aio-context-change to use
>> transactions, but you use it "internally" for aio-context-change.
>> aio-context-change doesn't become a native part of block-graph
>> modification transaction system after the series.
>>
>> For example, bdrv_attach_child_common(..., tran) still calls
>> bdrv_try_change_aio_context() function which doesn't take @tran
>> argument. And we have to call bdrv_try_change_aio_context() again in
>> bdrv_attach_child_common_abort() with opposite arguments by hand. We
>> create in _try_ separate Transaction object which is unrelated to the
>> original block-graph-change transaction.
>>
> 
> This can be fixed: patch 4 "bdrv_child_try_change_aio_context: add
> transaction parameter" supports taking transaction as a parameter.
> bdrv_attach_child_common could simply call
> bdrv_try_change_aio_context_tran (ok we need to deal with locking, but
> it could work).

No actually I don't get how it can work in bdrv_attach_child_common.
We have the following:

parent_ctx = bdrv_child_get_parent_aio_context(new_child);
if (child_ctx != parent_ctx) {
  int ret = bdrv_try_change_aio_context(child_bs, parent_ctx, NULL,
  &local_err);

  if (ret < 0 && child_class->change_aio_ctx) {
  ret_child = child_class->change_aio_ctx(new_child, child_ctx,
  visited, tran, NULL);

  tran_finalize(tran, ret_child == true ? 0 : -1);
  }

  if (ret < 0) {
  return ret;
  }
}

bdrv_replace_child_noperm(&new_child, child_bs, true);

So bdrv_try_change_aio_context would be changed in
bdrv_try_change_aio_context_tran, but then how can we call
bdrv_replace_child_noperm if no aiocontext has been changed at all?
I don't think we can mix transactional operations with non-transactional.

Emanuele

> 
> I think the main concern here is that during the prepare phase this
> series doesn't change any aiocontext, so until we don't commit the rest
> of the code cannot assume that the aiocontext has been changed.
> 
> But isn't it what happens also for permissions? Permission functions
> like bdrv_drv_set_perm perform bdrv_check_perm() in .prepare(), and then
> bdrv_set_perm() in commit.
> 
> Another important question is that if we actually want to put everything
> in a single transaction, because .prepare functions like the one
> proposed here perform drains, so the logic following prepare and
> preceding commit must take into account that everything is drained. Also
> prepare itself has to take into account that maybe other .prepare took
> locks or drained themselves...
> 
>> With good refactoring we should get rid of these _try_ functions, and
>> have just bdrv_change_aio_context(..., tran) that can be natively used
>> with external tran object, where other block-graph change actions
>> participate. This way we'll not have to call reverse
>> aio_context_change() in .abort, it will be done automatically.
>>
>> Moreover, your  *aio_context_change* functions that do have tran
>> parameter cannot be simply used in the block-graph change transaction,
>> as you don't follow the common paradigm, that in .prepare we do all
>> visible changes. That's why you have to still use _try_*() version that
>> creates separate transaction object and completes it: after that the
>> action is actually done and other graph-modifying actions can be done on
>> top.
>>
>> So, IMHO, we shouldn't go this way, as that adds transaction actions
>> that actually can't be used in 

Re: [PATCH v2 00/11] Refactor bdrv_try_set_aio_context using transactions

2022-08-05 Thread Emanuele Giuseppe Esposito



Am 27/07/2022 um 18:13 schrieb Vladimir Sementsov-Ogievskiy:
> On 7/25/22 15:21, Emanuele Giuseppe Esposito wrote:
>> The aim of this series is to reorganize bdrv_try_set_aio_context
>> and drop BDS ->set_aio_context and ->can_set_aio_ctx callbacks in
>> favour of a new one, ->change_aio_ctx.
>>
>> More information in patch 3 (which is also RFC, due to the doubts
>> I have with AioContext locks).
>>
>> Patch 1 just adds assertions in the code, 2 extends the transactions
>> API to be able to add also transactions in the tail
>> of the list.
>> Patch 3 is the core of this series, and introduces the new callback.
>> It is marked as RFC and the reason is explained in the commit message.
>> Patches 4-5-6 implement ->change_aio_ctx in the various block, blockjob
>> and block-backend BDSes.
>> Patch 7 substitutes ->change_aio_ctx with the old callbacks, and
>> patch 8 takes care of deleting the old callbacks and unused code.
>>
>> This series is based on "job: replace AioContext lock with job_mutex",
>> but just because it uses job_set_aio_context() introduced there.
>>
>> Suggested-by: Paolo Bonzini
>> Based-on:<20220706201533.289775-1-eespo...@redhat.com>
> 
> 

So, I read your email before going on PTO and at that point I got what
your concerns were, but now after re-reading it I don't understand
anymore what you mean :)

> What I dislike here is that you refactor aio-context-change to use
> transactions, but you use it "internally" for aio-context-change.
> aio-context-change doesn't become a native part of block-graph
> modification transaction system after the series.
> 
> For example, bdrv_attach_child_common(..., tran) still calls
> bdrv_try_change_aio_context() function which doesn't take @tran
> argument. And we have to call bdrv_try_change_aio_context() again in
> bdrv_attach_child_common_abort() with opposite arguments by hand. We
> create in _try_ separate Transaction object which is unrelated to the
> original block-graph-change transaction.
> 

This can be fixed: patch 4 "bdrv_child_try_change_aio_context: add
transaction parameter" supports taking transaction as a parameter.
bdrv_attach_child_common could simply call
bdrv_try_change_aio_context_tran (ok we need to deal with locking, but
it could work).

I think the main concern here is that during the prepare phase this
series doesn't change any aiocontext, so until we don't commit the rest
of the code cannot assume that the aiocontext has been changed.

But isn't it what happens also for permissions? Permission functions
like bdrv_drv_set_perm perform bdrv_check_perm() in .prepare(), and then
bdrv_set_perm() in commit.

Another important question is that if we actually want to put everything
in a single transaction, because .prepare functions like the one
proposed here perform drains, so the logic following prepare and
preceding commit must take into account that everything is drained. Also
prepare itself has to take into account that maybe other .prepare took
locks or drained themselves...

> With good refactoring we should get rid of these _try_ functions, and
> have just bdrv_change_aio_context(..., tran) that can be natively used
> with external tran object, where other block-graph change actions
> participate. This way we'll not have to call reverse
> aio_context_change() in .abort, it will be done automatically.
> 
> Moreover, your  *aio_context_change* functions that do have tran
> parameter cannot be simply used in the block-graph change transaction,
> as you don't follow the common paradigm, that in .prepare we do all
> visible changes. That's why you have to still use _try_*() version that
> creates separate transaction object and completes it: after that the
> action is actually done and other graph-modifying actions can be done on
> top.
> 
> So, IMHO, we shouldn't go this way, as that adds transaction actions
> that actually can't be used in common block-graph-modifying transaction
> but only context of bdrv_try_change_aio_context() internal transaction.
> 
> I agree that aio-context change should finally be rewritten to take a
> native place in block-graph transactions, but that should be a complete
> solution, introducing native bdrv_change_aio_context(..., tran)
> transaction action that is directly used in the block-graph transaction,
> do visible effect in .prepare and don't create extra Transaction objects.
> 




Re: [PATCH v10 20/21] blockjob: remove unused functions

2022-08-05 Thread Kevin Wolf
Am 25.07.2022 um 09:38 hat Emanuele Giuseppe Esposito geschrieben:
> These public functions are not used anywhere, thus can be dropped.
> 
> Signed-off-by: Emanuele Giuseppe Esposito 
> Reviewed-by: Stefan Hajnoczi 

> @@ -113,6 +111,7 @@ BlockJob *block_job_next_locked(BlockJob *job);
>   * Get the block job identified by @id (which must not be %NULL).
>   *
>   * Returns the requested job, or %NULL if it doesn't exist.
> + * Called with job lock *not* held.
>   */
>  BlockJob *block_job_get(const char *id);

Unrelated hunk. Whatever:

Reviewed-by: Kevin Wolf 




Re: [PATCH v10 21/21] job: remove unused functions

2022-08-05 Thread Kevin Wolf
Am 25.07.2022 um 09:38 hat Emanuele Giuseppe Esposito geschrieben:
> These public functions are not used anywhere, thus can be dropped.
> Also, since this is the final job API that doesn't use AioContext
> lock and replaces it with job_lock, adjust all remaining function
> documentation to clearly specify if the job lock is taken or not.
> 
> Signed-off-by: Emanuele Giuseppe Esposito 
> Reviewed-by: Stefan Hajnoczi 

You're also documenting the locking requirements for a few functions
where you don't remove a second version.

Reviewed-by: Kevin Wolf 




Re: [PATCH v10 19/21] block_job_query: remove atomic read

2022-08-05 Thread Kevin Wolf
Am 25.07.2022 um 09:38 hat Emanuele Giuseppe Esposito geschrieben:
> Not sure what the atomic here was supposed to do, since job.busy
> is protected by the job lock. Since the whole function
> is called under job_mutex, just remove the atomic.
> 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> Reviewed-by: Stefan Hajnoczi 
> Signed-off-by: Emanuele Giuseppe Esposito 

Reviewed-by: Kevin Wolf 




Re: [PATCH v10 18/21] job.c: enable job lock/unlock and remove Aiocontext locks

2022-08-05 Thread Kevin Wolf
Am 25.07.2022 um 09:38 hat Emanuele Giuseppe Esposito geschrieben:
> Change the job_{lock/unlock} and macros to use job_mutex.
> 
> Now that they are not nop anymore, remove the aiocontext
> to avoid deadlocks.

Okay, so this is the big bad patch where we need to verify the
completeness of all changes made so far. Let's see...

> Therefore:
> - when possible, remove completely the aiocontext lock/unlock pair
> - if it is used by some other function too, reduce the locking
>   section as much as possible, leaving the job API outside.
> - change AIO_WAIT_WHILE in AIO_WAIT_WHILE_UNLOCKED, since we
>   are not using the aiocontext lock anymore

Does this imply that there is a new rule that job_*() must not be called
with the AioContext lock held? Or is it optional now?

If it's a rule, it should be documented.


(Coming back after reviewing more of the patch:)


It doesn't seem to be a rule, or at least not a rule that is obeyed.

Actually each function in job.c belongs in one of at most three
categories: Must hold the AioContext (because it calls callbacks that
need it), may hold the AioContext optionally, and must not hold it
(everything that would cause the deadlocks you're alluding to, but not
explaining, in the commit message).

It is not obvious which function is in which category. So I maintain
that we need some documentation for the assumptions made.

All coroutine_fns (which are called from the job coroutine) should be in
the category that they still run with the AioContext locked, but that
covers only a small minority of functions.

The driver callbacks look mostly correct at least with respect to
AioContext locking, even if their status isn't documented. I suppose
this is the most important part.
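
Purely as an illustration of the kind of per-function annotation meant here
(hypothetical function name and wording, not from the series):

  /*
   * job_do_something_locked:
   * Called with job_mutex held. The caller must NOT hold the AioContext
   * lock of job->aio_context (the function may acquire it internally,
   * so holding it already would deadlock).
   */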

> There is only one JobDriver callback, ->free() that assumes that
> the aiocontext lock is held (because it calls bdrv_unref), so for
> now keep that under aiocontext lock.
> 
> Also remove real_job_{lock/unlock}, as they are replaced by the
> public functions.

At least this part is easily verified. All real_job_lock() sections are
part of a larger job_lock() section.

> Signed-off-by: Emanuele Giuseppe Esposito 
> Reviewed-by: Stefan Hajnoczi 
> ---
>  blockdev.c   | 74 +---
>  include/qemu/job.h   | 22 -
>  job-qmp.c| 46 +++--
>  job.c| 84 ++--
>  tests/unit/test-bdrv-drain.c |  4 +-
>  tests/unit/test-block-iothread.c |  2 +-
>  tests/unit/test-blockjob.c   | 15 ++
>  7 files changed, 52 insertions(+), 195 deletions(-)

> diff --git a/include/qemu/job.h b/include/qemu/job.h
> index c144aabefc..676f69bb2e 100644
> --- a/include/qemu/job.h
> +++ b/include/qemu/job.h
> @@ -75,13 +75,14 @@ typedef struct Job {
>  ProgressMeter progress;
>  
>  
> -/** Protected by AioContext lock */
> +/** Protected by job_mutex */
>  
>  /**
>   * AioContext to run the job coroutine in.
> - * This field can be read when holding either the BQL (so we are in
> - * the main loop) or the job_mutex.
> - * It can be only written when we hold *both* BQL and job_mutex.
> + * The job Aiocontext can be read when holding *either*
> + * the BQL (so we are in the main loop) or the job_mutex.
> + * It can only be written when we hold *both* BQL
> + * and the job_mutex.

Old and new version seem to say the same apart from stylistic
differences, so this feels like unnecessary churn.

>   */
>  AioContext *aio_context;
>  
> @@ -106,7 +107,7 @@ typedef struct Job {
>  /**
>   * Set to false by the job while the coroutine has yielded and may be
>   * re-entered by job_enter(). There may still be I/O or event loop 
> activity
> - * pending. Accessed under block_job_mutex (in blockjob.c).
> + * pending. Accessed under job_mutex.
>   *
>   * When the job is deferred to the main loop, busy is true as long as the
>   * bottom half is still pending.
> @@ -322,9 +323,9 @@ typedef enum JobCreateFlags {
>  
>  extern QemuMutex job_mutex;
>  
> -#define JOB_LOCK_GUARD() /* QEMU_LOCK_GUARD(&job_mutex) */
> +#define JOB_LOCK_GUARD() QEMU_LOCK_GUARD(&job_mutex)
>  
> -#define WITH_JOB_LOCK_GUARD() /* WITH_QEMU_LOCK_GUARD(&job_mutex) */
> +#define WITH_JOB_LOCK_GUARD() WITH_QEMU_LOCK_GUARD(&job_mutex)
>  
>  /**
>   * job_lock:
> @@ -672,7 +673,7 @@ void job_user_cancel_locked(Job *job, bool force, Error 
> **errp);
>   * Returns the return value from the job if the job actually completed
>   * during the call, or -ECANCELED if it was canceled.
>   *
> - * Callers must hold the AioContext lock of job->aio_context.
> + * Called with job_lock held.
>   */
>  int job_cancel_sync(Job *job, bool force);

This doesn't make sense, it describes job_cancel_sync_locked().

>  
> @@ -697,8 +698,7 @@ void job_cancel_sync_all(void);
>   * function).
>   *
>   * Returns the return value from the job.
> - *
> - * Callers must hold 

Re: [PATCH v2 14/15] mtest2make.py: teach suite name that are just "PROJECT"

2022-08-05 Thread Marc-André Lureau
Hi

On Fri, Aug 5, 2022 at 2:39 PM Paolo Bonzini  wrote:

> On 7/12/22 11:35, marcandre.lur...@redhat.com wrote:
> > From: Marc-André Lureau 
> >
> > A subproject test may be simply in the "PROJECT" suite (such as
> > "qemu-common" with the following patches)
> >
> > Signed-off-by: Marc-André Lureau 
> > ---
> >   scripts/mtest2make.py | 7 +--
> >   1 file changed, 5 insertions(+), 2 deletions(-)
> >
> > diff --git a/scripts/mtest2make.py b/scripts/mtest2make.py
> > index 0fe81efbbcec..606821ee2732 100644
> > --- a/scripts/mtest2make.py
> > +++ b/scripts/mtest2make.py
> > @@ -51,8 +51,11 @@ def process_tests(test, targets, suites):
> >
> >   test_suites = test['suite'] or ['default']
> >   for s in test_suites:
> > -# The suite name in the introspection info is "PROJECT:SUITE"
> > -s = s.split(':')[1]
> > +# The suite name in the introspection info is "PROJECT" or
> "PROJECT:SUITE"
> > +try:
> > +s = s.split(':')[1]
> > +except IndexError:
> > +continue
>
> Shouldn't you continue with s being simply "PROJECT"?  That is, just
>
>  if ':' in s:
>  s = s.split(':')[1]
>
> This way you can do "make check-qemu-common".
>
> >   if s == 'slow' or s == 'thorough':
> >   continue
> >   if s.endswith('-slow'):
>
> and then however these two "ifs" need to be under the case where the
> suite name is "PROJECT:SUITE".
>
>
Thanks for the tips, updated
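
For reference, the suite-name handling along those lines could look like this
(a sketch of the idea only; the actual updated patch may differ):

  # sketch: keep bare "PROJECT" suites as-is, and only apply the
  # slow/thorough filtering to the SUITE half of "PROJECT:SUITE" names
  for s in test_suites:
      if ':' in s:
          s = s.split(':')[1]
          if s in ('slow', 'thorough'):
              continue
          # ... the existing "-slow" handling also stays under this branch
      # ... then register the test under suite s, as before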


-- 
Marc-André Lureau


Re: [PATCH v10 17/21] blockjob: protect iostatus field in BlockJob struct

2022-08-05 Thread Kevin Wolf
Am 25.07.2022 um 09:38 hat Emanuele Giuseppe Esposito geschrieben:
> iostatus is the only field (together with .job) that needs
> protection using the job mutex.
> 
> It is set in the main loop (GLOBAL_STATE functions) but read
> in I/O code (block_job_error_action).
> 
> In order to protect it, change block_job_iostatus_set_err
> to block_job_iostatus_set_err_locked(), always called under
> job lock.
> 
> Signed-off-by: Emanuele Giuseppe Esposito 
> ---
>  blockjob.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/blockjob.c b/blockjob.c
> index 0663faee2c..448bdb5a53 100644
> --- a/blockjob.c
> +++ b/blockjob.c
> @@ -363,7 +363,8 @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
>  return block_job_query_locked(job, errp);
>  }
>  
> -static void block_job_iostatus_set_err(BlockJob *job, int error)
> +/* Called with job lock held */
> +static void block_job_iostatus_set_err_locked(BlockJob *job, int error)
>  {
>  if (job->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
>  job->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
> @@ -577,8 +578,8 @@ BlockErrorAction block_job_error_action(BlockJob *job, 
> BlockdevOnError on_err,
>   */
>  job->job.user_paused = true;
>  }
> +block_job_iostatus_set_err_locked(job, error);
>  }
> -block_job_iostatus_set_err(job, error);
>  }
>  return action;
>  }

Ah, so this patch does what I asked for in an earlier patch. I wonder if
it should be squashed there.

Vladimir's finding that we have an access in mirror may need a fix, but
while incomplete, this patch isn't wrong:

Reviewed-by: Kevin Wolf 




Re: [PATCH v2 14/15] mtest2make.py: teach suite name that are just "PROJECT"

2022-08-05 Thread Paolo Bonzini

On 7/12/22 11:35, marcandre.lur...@redhat.com wrote:

From: Marc-André Lureau 

A subproject test may be simply in the "PROJECT" suite (such as
"qemu-common" with the following patches)

Signed-off-by: Marc-André Lureau 
---
  scripts/mtest2make.py | 7 +--
  1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/scripts/mtest2make.py b/scripts/mtest2make.py
index 0fe81efbbcec..606821ee2732 100644
--- a/scripts/mtest2make.py
+++ b/scripts/mtest2make.py
@@ -51,8 +51,11 @@ def process_tests(test, targets, suites):
  
  test_suites = test['suite'] or ['default']

  for s in test_suites:
-# The suite name in the introspection info is "PROJECT:SUITE"
-s = s.split(':')[1]
+# The suite name in the introspection info is "PROJECT" or 
"PROJECT:SUITE"
+try:
+s = s.split(':')[1]
+except IndexError:
+continue


Shouldn't you continue with s being simply "PROJECT"?  That is, just

if ':' in s:
s = s.split(':')[1]

This way you can do "make check-qemu-common".


  if s == 'slow' or s == 'thorough':
  continue
  if s.endswith('-slow'):


and then however these two "ifs" need to be under the case where the 
suite name is "PROJECT:SUITE".


Paolo



Re: [PATCH v10 16/21] blockjob: rename notifier callbacks as _locked

2022-08-05 Thread Kevin Wolf
Am 25.07.2022 um 09:38 hat Emanuele Giuseppe Esposito geschrieben:
> They all are called with job_lock held, in job_event_*_locked()

Worth documenting that for the notifier lists in struct Job, too?

> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> Reviewed-by: Stefan Hajnoczi 
> Signed-off-by: Emanuele Giuseppe Esposito 

Reviewed-by: Kevin Wolf 




Re: [PATCH v10 15/21] blockjob.h: categorize fields in struct BlockJob

2022-08-05 Thread Kevin Wolf
Am 25.07.2022 um 09:38 hat Emanuele Giuseppe Esposito geschrieben:
> The same job lock is being used also to protect some of blockjob fields.
> Categorize them just as done in job.h.
> 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> Signed-off-by: Emanuele Giuseppe Esposito 

Why do speed, limit and blocker stay uncategorised? I think we should
document the locking rules for all fields.

Kevin




Re: [PATCH v10 14/21] jobs: protect job.aio_context with BQL and job_mutex

2022-08-05 Thread Kevin Wolf
Am 25.07.2022 um 09:38 hat Emanuele Giuseppe Esposito geschrieben:
> In order to make it thread safe, implement a "fake rwlock",
> where we allow reads under BQL *or* job_mutex held, but
> writes only under BQL *and* job_mutex.

Oh, so the "or BQL" part is only for job.aio_context? Okay.

> The only write we have is in child_job_set_aio_ctx, which always
> happens under drain (so the job is paused).
> For this reason, introduce job_set_aio_context and make sure that
> the context is set under BQL, job_mutex and drain.
> Also make sure all other places where the aiocontext is read
> are protected.
> 
> Note: at this stage, job_{lock/unlock} and job lock guard macros
> are *nop*.
> 
> Suggested-by: Paolo Bonzini 
> Signed-off-by: Emanuele Giuseppe Esposito 
> Reviewed-by: Stefan Hajnoczi 
> ---
>  block/replication.c |  6 --
>  blockjob.c  |  3 ++-
>  include/qemu/job.h  | 19 ++-
>  job.c   | 12 
>  4 files changed, 36 insertions(+), 4 deletions(-)
> 
> diff --git a/block/replication.c b/block/replication.c
> index 55c8f894aa..2189863df1 100644
> --- a/block/replication.c
> +++ b/block/replication.c
> @@ -148,8 +148,10 @@ static void replication_close(BlockDriverState *bs)
>  }
>  if (s->stage == BLOCK_REPLICATION_FAILOVER) {
>  commit_job = &s->commit_job->job;
> -assert(commit_job->aio_context == qemu_get_current_aio_context());
> -job_cancel_sync(commit_job, false);
> +WITH_JOB_LOCK_GUARD() {
> +assert(commit_job->aio_context == 
> qemu_get_current_aio_context());
> +job_cancel_sync_locked(commit_job, false);
> +}
>  }

.bdrv_close runs under the BQL, so why is this needed? Maybe a
GLOBAL_STATE_CODE() annotation would be helpful, though.

>  if (s->mode == REPLICATION_MODE_SECONDARY) {
> diff --git a/blockjob.c b/blockjob.c
> index 96fb9d9f73..9ff2727025 100644
> --- a/blockjob.c
> +++ b/blockjob.c
> @@ -162,12 +162,13 @@ static void child_job_set_aio_ctx(BdrvChild *c, 
> AioContext *ctx,
>  bdrv_set_aio_context_ignore(sibling->bs, ctx, ignore);
>  }
>  
> -job->job.aio_context = ctx;
> +job_set_aio_context(&job->job, ctx);
>  }
>  
>  static AioContext *child_job_get_parent_aio_context(BdrvChild *c)
>  {
>  BlockJob *job = c->opaque;
> +assert(qemu_in_main_thread());

Any reason not to use GLOBAL_STATE_CODE()?

>  return job->job.aio_context;
>  }
> diff --git a/include/qemu/job.h b/include/qemu/job.h
> index 5709e8d4a8..c144aabefc 100644
> --- a/include/qemu/job.h
> +++ b/include/qemu/job.h
> @@ -77,7 +77,12 @@ typedef struct Job {
>  
>  /** Protected by AioContext lock */

I think this section comment should move down below aio_context now.

> -/** AioContext to run the job coroutine in */
> +/**
> + * AioContext to run the job coroutine in.
> + * This field can be read when holding either the BQL (so we are in
> + * the main loop) or the job_mutex.
> + * It can be only written when we hold *both* BQL and job_mutex.
> + */
>  AioContext *aio_context;
>  
>  /** Reference count of the block job */
> @@ -741,4 +746,16 @@ int job_finish_sync(Job *job, void (*finish)(Job *, 
> Error **errp),
>  int job_finish_sync_locked(Job *job, void (*finish)(Job *, Error **errp),
> Error **errp);
>  
> +/**
> + * Sets the @job->aio_context.
> + * Called with job_mutex *not* held.
> + *
> + * This function must run in the main thread to protect against
> + * concurrent read in job_finish_sync_locked(),

Odd line break here in the middle of a sentence.

> + * takes the job_mutex lock to protect against the read in
> + * job_do_yield_locked(), and must be called when the coroutine
> + * is quiescent.
> + */
> +void job_set_aio_context(Job *job, AioContext *ctx);
> +
>  #endif
> diff --git a/job.c b/job.c
> index ecec66b44e..0a857b1468 100644
> --- a/job.c
> +++ b/job.c
> @@ -394,6 +394,17 @@ Job *job_get(const char *id)
>  return job_get_locked(id);
>  }
>  
> +void job_set_aio_context(Job *job, AioContext *ctx)
> +{
> +/* protect against read in job_finish_sync_locked and job_start */
> +assert(qemu_in_main_thread());

Same question about GLOBAL_STATE_CODE().

> +/* protect against read in job_do_yield_locked */
> +JOB_LOCK_GUARD();
> +/* ensure the coroutine is quiescent while the AioContext is changed */
> +assert(job->pause_count > 0);

job->pause_count only shows that pausing was requested. The coroutine is
only really quiescent if job->busy == false, too.

Or maybe job->paused is actually the one you want here.
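
Spelled out as a standalone toy (the field names mirror the ones discussed
here, but this is not QEMU code), the stricter check suggested above would
look like:

#include <assert.h>
#include <stdbool.h>

/* Toy model of the three job fields discussed above. */
struct toy_job {
    int  pause_count;   /* how many pause requests are outstanding */
    bool busy;          /* coroutine is currently running or scheduled */
    bool paused;        /* coroutine has actually reached the pause point */
};

/* pause_count > 0 only means a pause was *requested*; the coroutine is
 * really quiescent once it has stopped running and acknowledged the pause. */
static bool toy_job_is_quiescent(const struct toy_job *job)
{
    return job->pause_count > 0 && job->paused && !job->busy;
}

int main(void)
{
    struct toy_job job = { .pause_count = 1, .busy = true, .paused = false };
    assert(!toy_job_is_quiescent(&job));  /* pause requested, not yet reached */
    job.busy = false;
    job.paused = true;
    assert(toy_job_is_quiescent(&job));   /* now safe to change the context */
    return 0;
}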

> +job->aio_context = ctx;
> +}
> +
>  /* Called with job_mutex *not* held. */
>  static void job_sleep_timer_cb(void *opaque)
>  {
> @@ -1376,6 +1387,7 @@ int job_finish_sync_locked(Job *job,
>  {
>  Error *local_err = NULL;
>  int ret;
> +assert(qemu_in_main_thread());
>  
>  job_ref_locked(job);

Another GLOBAL_STATE_CODE()?

Kevin




Re: [PATCH v2 11/15] qemu-common: move scripts/qapi

2022-08-05 Thread Marc-André Lureau
Hi

On Fri, Aug 5, 2022 at 12:12 PM Markus Armbruster  wrote:

> marcandre.lur...@redhat.com writes:
>
> > From: Marc-André Lureau 
> >
> > This is just moving qapi-gen.py and related subdir to qemu-common, to
> > ease review and proceed step by step. The following patches will move
> > related necessary code, tests etc.
> >
> > Signed-off-by: Marc-André Lureau 
>
> As moved files tend to become low-level annoyances for a long time, I'd
> like to understand why you want to move them.  The commit message says
> "to ease review", which I suspect isn't the real reason.  Perhaps you
> explained all that elsewhere already, but I missed it.
>
>
>
The end goal is to split some projects, such as qemu-ga, to standalone
meson projects/subprojects. We will be able to build them independently
from the rest of QEMU, and later on perhaps handle them outside of QEMU
main repository. To achieve this, I first introduce a qemu-common
subproject, where qapi and common units are provided. You can check
https://gitlab.com/marcandre.lureau/qemu/-/commits/qga for a sneak peek at
current end result.

I said "to ease review and proceed step by step" simply because there are
no other changes: I don't move the rest of the qapi code & tests all
together, it's in the subsequent series.

-- 
Marc-André Lureau


Re: [PATCH v10 13/21] job: detect change of aiocontext within job coroutine

2022-08-05 Thread Kevin Wolf
Am 25.07.2022 um 09:38 hat Emanuele Giuseppe Esposito geschrieben:
> From: Paolo Bonzini 
> 
> We want to make sure access of job->aio_context is always done
> under either BQL or job_mutex.

Is this the goal of this series? If so, it would have been useful to
state this somewhere more obvious, because I had assumed that holding the BQL
would not be considered enough and that everyone needs to hold the job_mutex.

> The problem is that using
> aio_co_enter(job->aiocontext, job->co) in job_start and job_enter_cond
> makes the coroutine immediately resume, so we can't hold the job lock.
> And caching it is not safe either, as it might change.
> 
> job_start is under BQL, so it can freely read job->aiocontext, but
> job_enter_cond is not. In order to fix this, use aio_co_wake():
> the advantage is that it won't use job->aiocontext, but the
> main disadvantage is that it won't be able to detect a change of
> job AioContext.
> 
> Calling bdrv_try_set_aio_context() will issue the following calls
> (simplified):
> * in terms of  bdrv callbacks:
>   .drained_begin -> .set_aio_context -> .drained_end
> * in terms of child_job functions:
>   child_job_drained_begin -> child_job_set_aio_context -> 
> child_job_drained_end
> * in terms of job functions:
>   job_pause_locked -> job_set_aio_context -> job_resume_locked
> 
> We can see that after setting the new aio_context, job_resume_locked
> calls again job_enter_cond, which then invokes aio_co_wake(). But
> while job->aiocontext has been set in job_set_aio_context,
> job->co->ctx has not changed, so the coroutine would be entering in
> the wrong aiocontext.
> 
> Using aio_co_schedule in job_resume_locked() might seem as a valid
> alternative, but the problem is that the bh resuming the coroutine
> is not scheduled immediately, and if in the meanwhile another
> bdrv_try_set_aio_context() is run (see test_propagate_mirror() in
> test-block-iothread.c), we would have the first schedule in the
> wrong aiocontext, and the second set of drains won't even manage
> to schedule the coroutine, as job->busy would still be true from
> the previous job_resume_locked().
> 
> The solution is to stick with aio_co_wake(), but then detect every time
> the coroutine resumes back from yielding if job->aio_context
> has changed. If so, we can reschedule it to the new context.

Hm, but with this in place, what does aio_co_wake() actually buy us
compared to aio_co_enter()?

I guess it's a bit simpler code because you don't have to explicitly
specify the AioContext, but we're still going to enter the coroutine in
the wrong AioContext occasionally and have to reschedule it, just like
in the existing code (except that the rescheduling doesn't exist there
yet).

So while I don't disagree with the change, I don't think the
justification in the commit message is right for this part.

> Check for the aiocontext change in job_do_yield_locked because:
> 1) aio_co_reschedule_self requires to be in the running coroutine
> 2) since child_job_set_aio_context allows changing the aiocontext only
>while the job is paused, this is the exact place where the coroutine
>resumes, before running JobDriver's code.
> 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> Reviewed-by: Stefan Hajnoczi 
> Signed-off-by: Paolo Bonzini 
> ---
>  job.c | 21 +++--
>  1 file changed, 19 insertions(+), 2 deletions(-)
> 
> diff --git a/job.c b/job.c
> index b0729e2bb2..ecec66b44e 100644
> --- a/job.c
> +++ b/job.c
> @@ -585,7 +585,9 @@ void job_enter_cond_locked(Job *job, bool(*fn)(Job *job))
>  timer_del(&job->sleep_timer);
>  job->busy = true;
>  real_job_unlock();
> -aio_co_enter(job->aio_context, job->co);
> +job_unlock();
> +aio_co_wake(job->co);
> +job_lock();

The addition of job_unlock/lock is unrelated, this was necessary even
before this patch.

>  }
>  
>  void job_enter_cond(Job *job, bool(*fn)(Job *job))
> @@ -611,6 +613,8 @@ void job_enter(Job *job)
>   */
>  static void coroutine_fn job_do_yield_locked(Job *job, uint64_t ns)
>  {
> +AioContext *next_aio_context;
> +
>  real_job_lock();
>  if (ns != -1) {
>  timer_mod(&job->sleep_timer, ns);
> @@ -622,7 +626,20 @@ static void coroutine_fn job_do_yield_locked(Job *job, 
> uint64_t ns)
>  qemu_coroutine_yield();
>  job_lock();
>  
> -/* Set by job_enter_cond() before re-entering the coroutine.  */
> +next_aio_context = job->aio_context;
> +/*
> + * Coroutine has resumed, but in the meanwhile the job AioContext
> + * might have changed via bdrv_try_set_aio_context(), so we need to move
> + * the coroutine too in the new aiocontext.
> + */
> +while (qemu_get_current_aio_context() != next_aio_context) {
> +job_unlock();
> +aio_co_reschedule_self(next_aio_context);
> +job_lock();
> +next_aio_context = job->aio_context;
> +}
> +
> +/* Set by job_enter_cond_locked() before re-entering the coroutine.  */
>  assert(job->busy);
>  }

The 

Re: [PATCH v10 12/21] commit and mirror: create new nodes using bdrv_get_aio_context, and not the job aiocontext

2022-08-05 Thread Kevin Wolf
Am 25.07.2022 um 09:38 hat Emanuele Giuseppe Esposito geschrieben:
> We are always using the given bs AioContext, so there is no need
> to take the job ones (which is identical anyways).
> This also reduces the point we need to check when protecting
> job.aio_context field.
> 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> Reviewed-by: Stefan Hajnoczi 
> Signed-off-by: Emanuele Giuseppe Esposito 

I'm not sure against which scenario this is actually protecting. The
only reason for not using job.aio_context seems to be if someone else
can change the job AioContext in parallel. However, if that is the case
(I don't think it is, but for the hypothetical case this patch seems to
address), the AioContext is not identical any more and the change is
wrong because we actually want things to run in the job AioContext -
otherwise accessing the BlockBackend from the job coroutine wouldn't be
possible.

So I believe the current code expresses how we actually want to use the
BlockBackend and the change doesn't only protect nothing, but is even
misleading because it implies that the job can work with any AioContext,
which is not true.

Kevin




Re: [PATCH v2 11/15] qemu-common: move scripts/qapi

2022-08-05 Thread Markus Armbruster
marcandre.lur...@redhat.com writes:

> From: Marc-André Lureau 
>
> This is just moving qapi-gen.py and related subdir to qemu-common, to
> ease review and proceed step by step. The following patches will move
> related necessary code, tests etc.
>
> Signed-off-by: Marc-André Lureau 

As moved files tend to become low-level annoyances for a long time, I'd
like to understand why you want to move them.  The commit message says
"to ease review", which I suspect isn't the real reason.  Perhaps you
explained all that elsewhere already, but I missed it.




[PATCH v6 7/8] qemu-iotests: test new zone operations

2022-08-05 Thread Sam Li
We have added new block layer APIs for zoned block devices. Test them by
creating a null_blk device, running each zone operation on it and checking
whether the right zone information is reported.

Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
---
 tests/qemu-iotests/tests/zoned.out | 53 ++
 tests/qemu-iotests/tests/zoned.sh  | 86 ++
 2 files changed, 139 insertions(+)
 create mode 100644 tests/qemu-iotests/tests/zoned.out
 create mode 100755 tests/qemu-iotests/tests/zoned.sh

diff --git a/tests/qemu-iotests/tests/zoned.out 
b/tests/qemu-iotests/tests/zoned.out
new file mode 100644
index 00..d09be2ffcd
--- /dev/null
+++ b/tests/qemu-iotests/tests/zoned.out
@@ -0,0 +1,53 @@
+QA output created by zoned.sh
+Testing a null_blk device:
+Simple cases: if the operations work
+(1) report the first zone:
+start: 0x0, len 0x8, cap 0x8,wptr 0x0, zcond:1, [type: 2]
+
+report the first 10 zones
+start: 0x0, len 0x8, cap 0x8,wptr 0x0, zcond:1, [type: 2]
+start: 0x8, len 0x8, cap 0x8,wptr 0x8, zcond:1, [type: 2]
+start: 0x10, len 0x8, cap 0x8,wptr 0x10, zcond:1, [type: 2]
+start: 0x18, len 0x8, cap 0x8,wptr 0x18, zcond:1, [type: 2]
+start: 0x20, len 0x8, cap 0x8,wptr 0x20, zcond:1, [type: 2]
+start: 0x28, len 0x8, cap 0x8,wptr 0x28, zcond:1, [type: 2]
+start: 0x30, len 0x8, cap 0x8,wptr 0x30, zcond:1, [type: 2]
+start: 0x38, len 0x8, cap 0x8,wptr 0x38, zcond:1, [type: 2]
+start: 0x40, len 0x8, cap 0x8,wptr 0x40, zcond:1, [type: 2]
+start: 0x48, len 0x8, cap 0x8,wptr 0x48, zcond:1, [type: 2]
+
+report the last zone:
+start: 0x1f38, len 0x8, cap 0x8,wptr 0x1f38, zcond:1, [type: 2]
+
+
+(2) opening the first zone
+report after:
+start: 0x0, len 0x8, cap 0x8,wptr 0x0, zcond:3, [type: 2]
+
+opening the second zone
+report after:
+start: 0x8, len 0x8, cap 0x8,wptr 0x8, zcond:3, [type: 2]
+
+opening the last zone
+report after:
+start: 0x1f38, len 0x8, cap 0x8,wptr 0x1f38, zcond:3, [type: 2]
+
+
+(3) closing the first zone
+report after:
+start: 0x0, len 0x8, cap 0x8,wptr 0x0, zcond:1, [type: 2]
+
+closing the last zone
+report after:
+start: 0x1f38, len 0x8, cap 0x8,wptr 0x1f38, zcond:1, [type: 2]
+
+
+(4) finishing the second zone
+After finishing a zone:
+start: 0x8, len 0x8, cap 0x8,wptr 0x10, zcond:14, [type: 2]
+
+
+(5) resetting the second zone
+After resetting a zone:
+start: 0x8, len 0x8, cap 0x8,wptr 0x8, zcond:1, [type: 2]
+*** done
diff --git a/tests/qemu-iotests/tests/zoned.sh 
b/tests/qemu-iotests/tests/zoned.sh
new file mode 100755
index 00..db68aa88d4
--- /dev/null
+++ b/tests/qemu-iotests/tests/zoned.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+#
+# Test zone management operations.
+#
+
+seq="$(basename $0)"
+echo "QA output created by $seq"
+status=1 # failure is the default!
+
+_cleanup()
+{
+  _cleanup_test_img
+  sudo rmmod null_blk
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.qemu
+
+# This test only runs on Linux hosts with raw image files.
+_supported_fmt raw
+_supported_proto file
+_supported_os Linux
+
+QEMU_IO="build/qemu-io"
+IMG="--image-opts driver=zoned_host_device,filename=/dev/nullb0"
+QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
+
+echo "Testing a null_blk device:"
+echo "Simple cases: if the operations work"
+sudo modprobe null_blk nr_devices=1 zoned=1
+
+echo "(1) report the first zone:"
+sudo $QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "report the first 10 zones"
+sudo $QEMU_IO $IMG -c "zrp 0 10"
+echo
+echo "report the last zone:"
+sudo $QEMU_IO $IMG -c "zrp 0x3e7000 2"
+echo
+echo
+echo "(2) opening the first zone"
+sudo $QEMU_IO $IMG -c "zo 0 0x8"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "opening the second zone"
+sudo $QEMU_IO $IMG -c "zo 524288 0x8" # 524288 is the zone sector size
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 268435456 1" # 268435456 / 512 = 524288
+echo
+echo "opening the last zone"
+sudo $QEMU_IO $IMG -c "zo 0x1f38 0x8"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 0x3e7000 2"
+echo
+echo
+echo "(3) closing the first zone"
+sudo $QEMU_IO $IMG -c "zc 0 0x8"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "closing the last zone"
+sudo $QEMU_IO $IMG -c "zc 0x1f38 0x8"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 0x3e7000 2"
+echo
+echo
+echo "(4) finishing the second zone"
+sudo $QEMU_IO $IMG -c "zf 524288 0x8"
+echo "After finishing a zone:"
+sudo $QEMU_IO $IMG -c "zrp 268435456 1"
+echo
+echo
+
+echo "(5) resetting the second zone"
+sudo $QEMU_IO $IMG -c "zrs 524288 0x8"
+echo "After resetting a zone:"
+sudo $QEMU_IO $IMG -c "zrp 268435456 1"
+# success, all done

[PATCH v6 5/8] raw-format: add zone operations to pass through requests

2022-08-05 Thread Sam Li
The raw-format driver usually sits on top of the file-posix driver. It needs
to pass zone command requests through to the driver below.

Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
---
 block/raw-format.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/block/raw-format.c b/block/raw-format.c
index 69fd650eaf..6b20bd22ef 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -314,6 +314,17 @@ static int coroutine_fn raw_co_pdiscard(BlockDriverState 
*bs,
 return bdrv_co_pdiscard(bs->file, offset, bytes);
 }
 
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t 
offset,
+   unsigned int *nr_zones,
+   BlockZoneDescriptor *zones) {
+return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones);
+}
+
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+ int64_t offset, int64_t len) {
+return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
+}
+
 static int64_t raw_getlength(BlockDriverState *bs)
 {
 int64_t len;
@@ -614,6 +625,8 @@ BlockDriver bdrv_raw = {
 .bdrv_co_pwritev  = &raw_co_pwritev,
 .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
 .bdrv_co_pdiscard = &raw_co_pdiscard,
+.bdrv_co_zone_report  = &raw_co_zone_report,
+.bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
 .bdrv_co_block_status = &raw_co_block_status,
 .bdrv_co_copy_range_from = &raw_co_copy_range_from,
 .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
-- 
2.37.1




[PATCH v6 3/8] file-posix: introduce get_sysfs_str_val for device zoned model

2022-08-05 Thread Sam Li
Use sysfs attribute files to get the string value of the device's
zoned model. get_sysfs_zoned_model then converts it to the
BlockZoneModel type used in QEMU.

Signed-off-by: Sam Li 
Reviewed-by: Hannes Reinecke 
---
 block/file-posix.c   | 70 
 include/block/block_int-common.h |  3 ++
 2 files changed, 73 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index a40eab64a2..4785203eea 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1264,6 +1264,68 @@ out:
 #endif
 }
 
+/*
+ * Convert the zoned attribute file in sysfs to internal value.
+ */
+static int get_sysfs_str_val(int fd, struct stat *st,
+  const char *attribute,
+  char **val) {
+#ifdef CONFIG_LINUX
+char *buf = NULL;
+g_autofree char *sysfspath = NULL;
+int ret;
+size_t len;
+
+if (!S_ISBLK(st->st_mode)) {
+return -ENOTSUP;
+}
+
+sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
+major(st->st_rdev), minor(st->st_rdev),
+attribute);
+ret = g_file_get_contents(sysfspath, &buf, &len, NULL);
+if (ret == -1) {
+ret = -errno;
+return ret;
+}
+
+/* The file is ended with '\n' */
+if (buf[len - 1] == '\n') {
+buf[len - 1] = '\0';
+}
+
+if (!strncpy(*val, buf, len)) {
+ret = -errno;
+return ret;
+}
+g_free(buf);
+return 0;
+#else
+return -ENOTSUP;
+#endif
+}
+
+static int get_sysfs_zoned_model(int fd, struct stat *st,
+ BlockZoneModel *zoned) {
+g_autofree char *val = NULL;
+val = g_malloc(32);
+get_sysfs_str_val(fd, st, "zoned", &val);
+if (!val) {
+return -ENOTSUP;
+}
+
+if (strcmp(val, "host-managed") == 0) {
+*zoned = BLK_Z_HM;
+} else if (strcmp(val, "host-aware") == 0) {
+*zoned = BLK_Z_HA;
+} else if (strcmp(val, "none") == 0) {
+*zoned = BLK_Z_NONE;
+} else {
+return -ENOTSUP;
+}
+return 0;
+}
+
 static int hdev_get_max_segments(int fd, struct stat *st) {
 int ret;
 if (S_ISCHR(st->st_mode)) {
@@ -1279,6 +1341,8 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 {
 BDRVRawState *s = bs->opaque;
 struct stat st;
+int ret;
+BlockZoneModel zoned;
 
 s->needs_alignment = raw_needs_alignment(bs);
 raw_probe_alignment(bs, s->fd, errp);
@@ -1316,6 +1380,12 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 bs->bl.max_hw_iov = ret;
 }
 }
+
+ret = get_sysfs_zoned_model(s->fd, &st, &zoned);
+if (ret < 0) {
+zoned = BLK_Z_NONE;
+}
+bs->bl.zoned = zoned;
 }
 
 static int check_for_dasd(int fd)
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 8947abab76..7f7863cc9e 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -825,6 +825,9 @@ typedef struct BlockLimits {
 
 /* maximum number of iovec elements */
 int max_iov;
+
+/* device zone model */
+BlockZoneModel zoned;
 } BlockLimits;
 
 typedef struct BdrvOpBlocker BdrvOpBlocker;
-- 
2.37.1
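
The string-to-model mapping that get_sysfs_zoned_model() performs can be tried
in isolation. The following standalone sketch uses the same three attribute
values but its own toy enum; it is not the patch's code:

#include <stdio.h>
#include <string.h>

/* Same three values as QEMU's BlockZoneModel, as a toy enum. */
typedef enum { TOY_Z_NONE, TOY_Z_HM, TOY_Z_HA } ToyZoneModel;

/* Map the contents of the sysfs "zoned" attribute to a model, the way the
 * patch above does; any unknown string is treated as unsupported (-1). */
static int toy_parse_zoned_model(const char *val, ToyZoneModel *model)
{
    if (strcmp(val, "host-managed") == 0) {
        *model = TOY_Z_HM;
    } else if (strcmp(val, "host-aware") == 0) {
        *model = TOY_Z_HA;
    } else if (strcmp(val, "none") == 0) {
        *model = TOY_Z_NONE;
    } else {
        return -1;
    }
    return 0;
}

int main(void)
{
    const char *samples[] = { "none", "host-managed", "host-aware", "bogus" };
    for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        ToyZoneModel m;
        int ret = toy_parse_zoned_model(samples[i], &m);
        printf("%-12s -> %s\n", samples[i],
               ret < 0 ? "unsupported"
                       : (m == TOY_Z_HM ? "host-managed (HM)"
                          : m == TOY_Z_HA ? "host-aware (HA)" : "none"));
    }
    return 0;
}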




[PATCH v6 8/8] docs/zoned-storage: add zoned device documentation

2022-08-05 Thread Sam Li
Add the documentation about the zoned device support to virtio-blk
emulation.

Signed-off-by: Sam Li 
---
 docs/devel/zoned-storage.rst   | 41 ++
 docs/system/qemu-block-drivers.rst.inc |  6 
 2 files changed, 47 insertions(+)
 create mode 100644 docs/devel/zoned-storage.rst

diff --git a/docs/devel/zoned-storage.rst b/docs/devel/zoned-storage.rst
new file mode 100644
index 00..c3f1e477ac
--- /dev/null
+++ b/docs/devel/zoned-storage.rst
@@ -0,0 +1,41 @@
+=
+zoned-storage
+=
+
+Zoned Block Devices (ZBDs) divide the LBA space into block regions called zones
+that are larger than the LBA size. They can only accept sequential writes, which
+reduces write amplification in SSDs, leading to higher throughput and increased
+capacity. More details about ZBDs can be found at:
+
+https://zonedstorage.io/docs/introduction/zoned-storage
+
+1. Block layer APIs for zoned storage
+-
+The QEMU block layer has three zoned storage models:
+- BLK_Z_HM: This model only allows sequential write access. It supports a set
+of ZBD-specific I/O requests that are used by the host to manage device zones.
+- BLK_Z_HA: This model accepts both sequential and random write access.
+- BLK_Z_NONE: Regular block devices and drive-managed ZBDs are treated as
+non-zoned devices.
+
+The block device information resides inside BlockDriverState. QEMU uses the
+BlockLimits struct (BlockDriverState::bl), which is continuously accessed by
+the block layer while processing I/O requests. A BlockBackend has a root
+pointer to a BlockDriverState graph (for example, raw format on top of
+file-posix). The zoned storage information can be propagated from the leaf
+BlockDriverState all the way up to the BlockBackend. If the zoned storage
+model in file-posix is set to BLK_Z_HM, then block drivers will declare
+support for zoned host devices.
+
+The block layer APIs support commands needed for zoned storage devices,
+including report zones, four zone operations, and zone append.
+
+2. Emulating zoned storage controllers
+--
+When the BlockBackend's BlockLimits model reports a zoned storage device, users
+like the virtio-blk emulation or the qemu-io-cmds.c utility can use block layer
+APIs for zoned storage emulation or testing.
+
+For example, the command line for testing zone report on a null_blk device
+with qemu-io is:
+$ path/to/qemu-io --image-opts driver=zoned_host_device,filename=/dev/nullb0 -c
+"zrp offset nr_zones"
diff --git a/docs/system/qemu-block-drivers.rst.inc 
b/docs/system/qemu-block-drivers.rst.inc
index dfe5d2293d..0b97227fd9 100644
--- a/docs/system/qemu-block-drivers.rst.inc
+++ b/docs/system/qemu-block-drivers.rst.inc
@@ -430,6 +430,12 @@ Hard disks
   you may corrupt your host data (use the ``-snapshot`` command
   line option or modify the device permissions accordingly).
 
+Zoned block devices
+  Zoned block devices can be passed through to the guest if the emulated 
storage
+  controller supports zoned storage. Use ``--blockdev zoned_host_device,
+  node-name=drive0,filename=/dev/nullb0`` to pass through ``/dev/nullb0``
+  as ``drive0``.
+
 Windows
 ^^^
 
-- 
2.37.1
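
The leaf-to-root propagation this document describes can be pictured with a
standalone sketch (invented names; in QEMU the real mechanism is the
BlockLimits refresh across the BlockDriverState graph, not a walk like this):

#include <stdio.h>

typedef enum { TOY_Z_NONE, TOY_Z_HM, TOY_Z_HA } ToyZoneModel;

/* A node in a toy driver chain, e.g. raw format on top of file-posix. */
struct toy_node {
    const char *driver;
    ToyZoneModel zoned;        /* only meaningful on the leaf */
    struct toy_node *file;     /* child node, NULL for the leaf */
};

/* Walk down to the leaf and report its zoned model, mimicking how the
 * zoned limit surfaces from file-posix up to the BlockBackend. */
static ToyZoneModel toy_effective_model(const struct toy_node *node)
{
    while (node->file) {
        node = node->file;
    }
    return node->zoned;
}

int main(void)
{
    struct toy_node posix_leaf = { "file-posix", TOY_Z_HM, NULL };
    struct toy_node raw_root = { "raw", TOY_Z_NONE, &posix_leaf };
    printf("effective model seen at the top: %s\n",
           toy_effective_model(&raw_root) == TOY_Z_HM ? "host-managed"
                                                      : "other");
    return 0;
}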




[PATCH v6 4/8] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-08-05 Thread Sam Li
By adding zone management operations to BlockDriver, storage controller
emulation can use the new block layer APIs, including Report Zones and the
four zone management operations (open, close, finish, reset).

Add qemu-io sub-commands for the zoned storage operations: zone_report (zrp),
zone_open (zo), zone_close (zc), zone_reset (zrs), zone_finish (zf).

For example, to test zone_report, use the following command:
$ ./build/qemu-io --image-opts driver=zoned_host_device,filename=/dev/nullb0
-c "zrp offset nr_zones"

Signed-off-by: Sam Li 
Reviewed-by: Hannes Reinecke 
---
 block/block-backend.c|  50 +
 block/coroutines.h   |   6 +
 block/file-posix.c   | 315 ++-
 block/io.c   |  41 
 include/block/block-common.h |   1 -
 include/block/block-io.h |  13 ++
 include/block/block_int-common.h |  22 ++-
 include/block/raw-aio.h  |   6 +-
 meson.build  |   1 +
 qapi/block-core.json |   8 +-
 qemu-io-cmds.c   | 144 ++
 11 files changed, 601 insertions(+), 6 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index d4a5df2ac2..fc639b0cd7 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1775,6 +1775,56 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
 return ret;
 }
 
+/*
+ * Send a zone_report command.
+ * offset is a byte offset from the start of the device. No alignment
+ * required for offset.
+ * nr_zones represents IN maximum and OUT actual.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+unsigned int *nr_zones,
+BlockZoneDescriptor *zones)
+{
+int ret;
+IO_CODE();
+
+blk_inc_in_flight(blk); /* increase before waiting */
+blk_wait_while_drained(blk);
+if (!blk_is_available(blk)) {
+blk_dec_in_flight(blk);
+return -ENOMEDIUM;
+}
+ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
+blk_dec_in_flight(blk);
+return ret;
+}
+
+/*
+ * Send a zone_management command.
+ * offset is the starting zone specified as a sector offset.
+ * len is the maximum number of sectors the command should operate on.
+ */
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+int64_t offset, int64_t len)
+{
+int ret;
+IO_CODE();
+
+ret = blk_check_byte_request(blk, offset, len);
+if (ret < 0) {
+return ret;
+}
+blk_inc_in_flight(blk);
+blk_wait_while_drained(blk);
+if (!blk_is_available(blk)) {
+blk_dec_in_flight(blk);
+return -ENOMEDIUM;
+}
+ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
+blk_dec_in_flight(blk);
+return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
 BlockDriverState *bs = blk_bs(blk);
diff --git a/block/coroutines.h b/block/coroutines.h
index 3a2bad564f..e3f62d94e5 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -63,6 +63,12 @@ nbd_co_do_establish_connection(BlockDriverState *bs, bool 
blocking,
Error **errp);
 
 
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+unsigned int *nr_zones,
+BlockZoneDescriptor *zones);
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+  int64_t offset, int64_t len);
+
 /*
  * "I/O or GS" API functions. These functions can run without
  * the BQL, but only in one specific iothread/main loop.
diff --git a/block/file-posix.c b/block/file-posix.c
index 4785203eea..2627431581 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -67,6 +67,9 @@
 #include 
 #include 
 #include 
+#if defined(CONFIG_BLKZONED)
+#include 
+#endif
 #include 
 #include 
 #include 
@@ -216,6 +219,13 @@ typedef struct RawPosixAIOData {
 PreallocMode prealloc;
 Error **errp;
 } truncate;
+struct {
+unsigned int *nr_zones;
+BlockZoneDescriptor *zones;
+} zone_report;
+struct {
+BlockZoneOp op;
+} zone_mgmt;
 };
 } RawPosixAIOData;
 
@@ -1369,7 +1379,7 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 #endif
 
 if (bs->sg || S_ISBLK(st.st_mode)) {
-int ret = hdev_get_max_hw_transfer(s->fd, &st);
+ret = hdev_get_max_hw_transfer(s->fd, &st);
 
 if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
 bs->bl.max_hw_transfer = ret;
@@ -1386,6 +1396,27 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 zoned = BLK_Z_NONE;
 }
 bs->bl.zoned = zoned;
+if (zoned != BLK_Z_NONE) {
+ret = get_sysfs_long_val(s->fd, &st, "chunk_sectors");
+if (ret > 0) {
+bs->bl.zone_sectors = ret;
+}
+
+ret = get_sysfs_long_val(s->fd, &st, "zone_append_max_bytes");
+
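
The comment above blk_co_zone_report() says nr_zones is an IN maximum and an
OUT actual count. Below is a standalone caller sketch of that convention, with
a stub standing in for the real function and a made-up zone layout:

#include <stdint.h>
#include <stdio.h>

/* Same shape as BlockZoneDescriptor from patch 1/8 (values in bytes). */
struct toy_zone {
    uint64_t start, length, cap, wp;
};

/* Stub for a report function: fills at most *nr_zones entries starting at
 * byte offset, then writes back how many were actually produced. */
static int toy_zone_report(uint64_t offset, unsigned int *nr_zones,
                           struct toy_zone *zones)
{
    const uint64_t zone_size = 524288ULL * 512;   /* toy: 524288 sectors */
    const unsigned int total = 16;                /* pretend device: 16 zones */
    unsigned int first = offset / zone_size;
    unsigned int n = 0;

    while (n < *nr_zones && first + n < total) {
        zones[n].start = (uint64_t)(first + n) * zone_size;
        zones[n].length = zones[n].cap = zone_size;
        zones[n].wp = zones[n].start;             /* empty zone: wp at start */
        n++;
    }
    *nr_zones = n;    /* OUT: actual number of descriptors filled in */
    return 0;
}

int main(void)
{
    struct toy_zone zones[4];
    unsigned int nr_zones = 4;                    /* IN: capacity of the array */

    toy_zone_report(0, &nr_zones, zones);
    for (unsigned int i = 0; i < nr_zones; i++) {
        printf("start 0x%llx len 0x%llx wp 0x%llx\n",
               (unsigned long long)zones[i].start,
               (unsigned long long)zones[i].length,
               (unsigned long long)zones[i].wp);
    }
    return 0;
}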

[PATCH v6 6/8] config: add check to block layer

2022-08-05 Thread Sam Li
Putting zoned/non-zoned BlockDrivers on top of each other is not
allowed.

Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
---
 block.c  | 13 +
 block/file-posix.c   |  1 +
 block/raw-format.c   |  1 +
 include/block/block_int-common.h | 10 ++
 4 files changed, 25 insertions(+)

diff --git a/block.c b/block.c
index bc85f46eed..8a259b158c 100644
--- a/block.c
+++ b/block.c
@@ -7947,6 +7947,19 @@ void bdrv_add_child(BlockDriverState *parent_bs, 
BlockDriverState *child_bs,
 return;
 }
 
+/*
+ * Non-zoned block drivers do not follow zoned storage constraints
+ * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
+ * drivers in a graph.
+ */
+if (!parent_bs->drv->supports_zoned_children && child_bs->drv->is_zoned) {
+error_setg(errp, "Cannot add a %s child to a %s parent",
+   child_bs->drv->is_zoned ? "zoned" : "non-zoned",
+   parent_bs->drv->supports_zoned_children ?
+   "support zoned children" : "not support zoned children");
+return;
+}
+
 if (!QLIST_EMPTY(&child_bs->parents)) {
 error_setg(errp, "The node %s already has a parent",
child_bs->node_name);
diff --git a/block/file-posix.c b/block/file-posix.c
index 2627431581..7ab39eb291 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -4048,6 +4048,7 @@ static BlockDriver bdrv_zoned_host_device = {
 .format_name = "zoned_host_device",
 .protocol_name = "zoned_host_device",
 .instance_size = sizeof(BDRVRawState),
+.is_zoned = true,
 .bdrv_needs_filename = true,
 .bdrv_probe_device  = hdev_probe_device,
 .bdrv_parse_filename = zoned_host_device_parse_filename,
diff --git a/block/raw-format.c b/block/raw-format.c
index 6b20bd22ef..9441536819 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -614,6 +614,7 @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild 
*c,
 BlockDriver bdrv_raw = {
 .format_name  = "raw",
 .instance_size= sizeof(BDRVRawState),
+.supports_zoned_children = true,
 .bdrv_probe   = &raw_probe,
 .bdrv_reopen_prepare  = &raw_reopen_prepare,
 .bdrv_reopen_commit   = &raw_reopen_commit,
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index de44c7b6f4..0476cd0491 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -126,6 +126,16 @@ struct BlockDriver {
  */
 bool is_format;
 
+/*
+ * Set to true if the BlockDriver is a zoned block driver.
+ */
+bool is_zoned;
+
+/*
+ * Set to true if the BlockDriver supports zoned children.
+ */
+bool supports_zoned_children;
+
 /*
  * Drivers not implementing bdrv_parse_filename nor bdrv_open should have
  * this field set to true, except ones that are defined only by their
-- 
2.37.1
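
The rule enforced by the new bdrv_add_child() check can be shown in isolation
with toy structs mirroring the two new flags (qcow2 below is just an arbitrary
example of a driver that does not set supports_zoned_children):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for the two new BlockDriver flags added by this patch. */
struct toy_driver {
    const char *name;
    bool is_zoned;
    bool supports_zoned_children;
};

/* Refuse attaching a zoned child to a parent that cannot honour the
 * sequential-write constraints, mirroring the bdrv_add_child() check. */
static bool toy_can_attach(const struct toy_driver *parent,
                           const struct toy_driver *child)
{
    return !(child->is_zoned && !parent->supports_zoned_children);
}

int main(void)
{
    struct toy_driver zoned_host = { "zoned_host_device", true, false };
    struct toy_driver raw = { "raw", false, true };
    struct toy_driver qcow2 = { "qcow2", false, false };

    printf("raw   over zoned_host_device: %s\n",
           toy_can_attach(&raw, &zoned_host) ? "ok" : "refused");
    printf("qcow2 over zoned_host_device: %s\n",
           toy_can_attach(&qcow2, &zoned_host) ? "ok" : "refused");
    return 0;
}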




[PATCH v6 2/8] file-posix: introduce get_sysfs_long_val for the long sysfs attribute

2022-08-05 Thread Sam Li
Use sysfs attribute files to get the long value of zoned device
information.

Signed-off-by: Sam Li 
Reviewed-by: Hannes Reinecke 
---
 block/file-posix.c | 37 +++--
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 48cd096624..a40eab64a2 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1210,29 +1210,27 @@ static int hdev_get_max_hw_transfer(int fd, struct stat 
*st)
 #endif
 }
 
-static int hdev_get_max_segments(int fd, struct stat *st)
-{
+/*
+ * Get zoned device information (chunk_sectors, zoned_append_max_bytes,
+ * max_open_zones, max_active_zones) through sysfs attribute files.
+ */
+static long get_sysfs_long_val(int fd, struct stat *st,
+   const char *attribute) {
 #ifdef CONFIG_LINUX
 char buf[32];
 const char *end;
 char *sysfspath = NULL;
 int ret;
 int sysfd = -1;
-long max_segments;
-
-if (S_ISCHR(st->st_mode)) {
-if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
-return ret;
-}
-return -ENOTSUP;
-}
+long val;
 
 if (!S_ISBLK(st->st_mode)) {
 return -ENOTSUP;
 }
 
-sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
-major(st->st_rdev), minor(st->st_rdev));
+sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
+major(st->st_rdev), minor(st->st_rdev),
+attribute);
 sysfd = open(sysfspath, O_RDONLY);
 if (sysfd == -1) {
 ret = -errno;
@@ -1250,9 +1248,9 @@ static int hdev_get_max_segments(int fd, struct stat *st)
 }
 buf[ret] = 0;
 /* The file is ended with '\n', pass 'end' to accept that. */
-ret = qemu_strtol(buf, &end, 10, &max_segments);
+ret = qemu_strtol(buf, &end, 10, &val);
 if (ret == 0 && end && *end == '\n') {
-ret = max_segments;
+ret = val;
 }
 
 out:
@@ -1266,6 +1264,17 @@ out:
 #endif
 }
 
+static int hdev_get_max_segments(int fd, struct stat *st) {
+int ret;
+if (S_ISCHR(st->st_mode)) {
+if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
+return ret;
+}
+return -ENOTSUP;
+}
+return get_sysfs_long_val(fd, st, "max_segments");
+}
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
 BDRVRawState *s = bs->opaque;
-- 
2.37.1
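
For experimenting outside QEMU, the same sysfs read can be done with plain
libc. This standalone sketch is only an equivalent toy, not the patch's
helper; it takes the major:minor pair and the attribute name on the command
line:

#include <stdio.h>
#include <stdlib.h>

/* Read a single long value from /sys/dev/block/<maj>:<min>/queue/<attr>,
 * e.g. "8:0 max_segments" or "259:0 chunk_sectors". */
int main(int argc, char **argv)
{
    char path[256];
    long val;
    FILE *f;

    if (argc != 3) {
        fprintf(stderr, "usage: %s <major:minor> <attribute>\n", argv[0]);
        return 1;
    }
    snprintf(path, sizeof(path), "/sys/dev/block/%s/queue/%s",
             argv[1], argv[2]);
    f = fopen(path, "r");
    if (!f) {
        perror(path);
        return 1;
    }
    if (fscanf(f, "%ld", &val) != 1) {   /* sysfs file holds one number + '\n' */
        fprintf(stderr, "%s: not a number\n", path);
        fclose(f);
        return 1;
    }
    fclose(f);
    printf("%s = %ld\n", argv[2], val);
    return 0;
}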




[PATCH v6 1/8] include: add zoned device structs

2022-08-05 Thread Sam Li
Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block-common.h | 43 
 1 file changed, 43 insertions(+)

diff --git a/include/block/block-common.h b/include/block/block-common.h
index fdb7306e78..36bd0e480e 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -49,6 +49,49 @@ typedef struct BlockDriver BlockDriver;
 typedef struct BdrvChild BdrvChild;
 typedef struct BdrvChildClass BdrvChildClass;
 
+typedef enum BlockZoneOp {
+BLK_ZO_OPEN,
+BLK_ZO_CLOSE,
+BLK_ZO_FINISH,
+BLK_ZO_RESET,
+} BlockZoneOp;
+
+typedef enum BlockZoneModel {
+BLK_Z_NONE = 0x0, /* Regular block device */
+BLK_Z_HM = 0x1, /* Host-managed zoned block device */
+BLK_Z_HA = 0x2, /* Host-aware zoned block device */
+} BlockZoneModel;
+
+typedef enum BlockZoneCondition {
+BLK_ZS_NOT_WP = 0x0,
+BLK_ZS_EMPTY = 0x1,
+BLK_ZS_IOPEN = 0x2,
+BLK_ZS_EOPEN = 0x3,
+BLK_ZS_CLOSED = 0x4,
+BLK_ZS_RDONLY = 0xD,
+BLK_ZS_FULL = 0xE,
+BLK_ZS_OFFLINE = 0xF,
+} BlockZoneCondition;
+
+typedef enum BlockZoneType {
+BLK_ZT_CONV = 0x1, /* Conventional random writes supported */
+BLK_ZT_SWR = 0x2, /* Sequential writes required */
+BLK_ZT_SWP = 0x3, /* Sequential writes preferred */
+} BlockZoneType;
+
+/*
+ * Zone descriptor data structure.
+ * Provides information on a zone with all position and size values in bytes.
+ */
+typedef struct BlockZoneDescriptor {
+uint64_t start;
+uint64_t length;
+uint64_t cap;
+uint64_t wp;
+BlockZoneType type;
+BlockZoneCondition cond;
+} BlockZoneDescriptor;
+
 typedef struct BlockDriverInfo {
 /* in bytes, 0 if irrelevant */
 int cluster_size;
-- 
2.37.1
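
A short standalone example of how a consumer might walk an array of these
descriptors; the struct layout is copied from the patch, the enum fields are
flattened to plain int to keep the sketch self-contained, and the sample
values are made up:

#include <stdint.h>
#include <stdio.h>

/* Copied from the patch above: all position and size values are in bytes. */
typedef struct BlockZoneDescriptor {
    uint64_t start;
    uint64_t length;
    uint64_t cap;
    uint64_t wp;
    int type;    /* BlockZoneType in QEMU; plain int keeps this standalone */
    int cond;    /* BlockZoneCondition likewise */
} BlockZoneDescriptor;

int main(void)
{
    /* Two made-up zones of 4 MiB each, the first one half-written. */
    BlockZoneDescriptor zones[] = {
        { 0,       4 << 20, 4 << 20, 2 << 20, 2 /* SWR */, 2 /* IOPEN */ },
        { 4 << 20, 4 << 20, 4 << 20, 4 << 20, 2 /* SWR */, 1 /* EMPTY */ },
    };

    for (size_t i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
        printf("zone %zu: start=0x%llx len=0x%llx wp=0x%llx free=%llu bytes\n",
               i,
               (unsigned long long)zones[i].start,
               (unsigned long long)zones[i].length,
               (unsigned long long)zones[i].wp,
               (unsigned long long)(zones[i].start + zones[i].cap
                                    - zones[i].wp));
    }
    return 0;
}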




[PATCH v6 0/8] Add support for zoned device

2022-08-05 Thread Sam Li
Zoned Block Devices (ZBDs) divide the LBA space into block regions called zones
that are larger than the LBA size. They can only accept sequential writes, which
reduces write amplification in SSDs, leading to higher throughput and increased
capacity. More details about ZBDs can be found at:

https://zonedstorage.io/docs/introduction/zoned-storage

The zoned device support aims to let guests (virtual machines) access zoned
storage devices on the host (hypervisor) through a virtio-blk device. This
involves extending QEMU's block layer and virtio-blk emulation code.  In its
current status, the virtio-blk device is not aware of ZBDs but the guest sees
host-managed drives as regular drives that run correctly under the most
common write workloads.

This patch series extends the block layer APIs with the minimum set of zoned
commands that are necessary to support zoned devices. The commands are Report
Zones, four zone operations and Zone Append (still in development).

It can be tested on a null_blk device using qemu-io or qemu-iotests. For 
example, the command line for zone report using qemu-io is:

$ path/to/qemu-io --image-opts driver=zoned_host_device,filename=/dev/nullb0 -c 
"zrp offset nr_zones"

v6:
- drop virtio-blk emulation changes
- address Stefan's review comments
  * fix CONFIG_BLKZONED configs in related functions
  * replace reading fd by g_file_get_contents() in get_sysfs_str_val()
  * rewrite documentation for zoned storage

v5:
- add zoned storage emulation to virtio-blk device
- add documentation for zoned storage
- address review comments
  * fix qemu-iotests
  * fix check to block layer
  * modify interfaces of sysfs helper functions
  * rename zoned device structs according to QEMU styles
  * reorder patches

v4:
- add virtio-blk headers for zoned device
- add configurations for zoned host device
- add zone operations for raw-format
- address review comments
  * fix memory leak bug in zone_report
  * add checks to block layers
  * fix qemu-iotests format
  * fix sysfs helper functions

v3:
- add helper functions to get sysfs attributes
- address review comments
  * fix zone report bugs
  * fix the qemu-io code path
  * use thread pool to avoid blocking ioctl() calls

v2:
- add qemu-io sub-commands
- address review comments
  * modify interfaces of APIs

v1:
- add block layer APIs resembling Linux ZoneBlockDevice ioctls

Sam Li (8):
  include: add zoned device structs
  file-posix: introduce get_sysfs_long_val for the long sysfs attribute
  file-posix: introduce get_sysfs_str_val for device zoned model
  block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  raw-format: add zone operations to pass through requests
  config: add check to block layer
  qemu-iotests: test new zone operations
  docs/zoned-storage: add zoned device documentation

 block.c|  13 +
 block/block-backend.c  |  50 +++
 block/coroutines.h |   6 +
 block/file-posix.c | 423 -
 block/io.c |  41 +++
 block/raw-format.c |  14 +
 docs/devel/zoned-storage.rst   |  41 +++
 docs/system/qemu-block-drivers.rst.inc |   6 +
 include/block/block-common.h   |  44 ++-
 include/block/block-io.h   |  13 +
 include/block/block_int-common.h   |  35 +-
 include/block/raw-aio.h|   6 +-
 meson.build|   1 +
 qapi/block-core.json   |   8 +-
 qemu-io-cmds.c | 144 +
 tests/qemu-iotests/tests/zoned.out |  53 
 tests/qemu-iotests/tests/zoned.sh  |  86 +
 17 files changed, 964 insertions(+), 20 deletions(-)
 create mode 100644 docs/devel/zoned-storage.rst
 create mode 100644 tests/qemu-iotests/tests/zoned.out
 create mode 100755 tests/qemu-iotests/tests/zoned.sh

-- 
2.37.1