[PATCH v2 03/11] qcow2: put discard requests in the common queue when discard-no-unref enabled

2024-05-13 Thread Andrey Drobyshev
Normally discard requests are stored in the queue attached to BDRVQcow2State
to be processed later at once.  Currently discard-no-unref option handling
causes these requests to be processed straight away.  Let's fix that.

Note that when doing regular discards qcow2_free_any_cluster() would check
for the presence of external data files for us and redirect request to
underlying data_file.  Here we want to do the same but avoid refcount updates,
thus we perform the same checks.

Suggested-by: Hanna Czenczek 
Signed-off-by: Andrey Drobyshev 
---
 block/qcow2-cluster.c | 39 +--
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 5f057ba2fd..7dff0bd5a1 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1893,6 +1893,28 @@ again:
 return 0;
 }
 
+/*
+ * Helper for adding a discard request to the queue without any refcount
+ * modifications.  If external data file is used redirects the request to
+ * the corresponding BdrvChild.
+ */
+static inline void
+discard_no_unref_any_file(BlockDriverState *bs, uint64_t offset,
+  uint64_t length, QCow2ClusterType ctype,
+  enum qcow2_discard_type dtype)
+{
+BDRVQcow2State *s = bs->opaque;
+
+if (s->discard_passthrough[dtype] &&
+(ctype == QCOW2_CLUSTER_NORMAL || ctype == QCOW2_CLUSTER_ZERO_ALLOC)) {
+if (has_data_file(bs)) {
+bdrv_pdiscard(s->data_file, offset, length);
+} else {
+qcow2_queue_discard(bs, offset, length);
+}
+}
+}
+
 /*
  * This discards as many clusters of nb_clusters as possible at once (i.e.
  * all clusters in the same L2 slice) and returns the number of discarded
@@ -1974,12 +1996,10 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t 
offset, uint64_t nb_clusters,
 if (!keep_reference) {
 /* Then decrease the refcount */
 qcow2_free_any_cluster(bs, old_l2_entry, type);
-} else if (s->discard_passthrough[type] &&
-   (cluster_type == QCOW2_CLUSTER_NORMAL ||
-cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) {
+} else {
 /* If we keep the reference, pass on the discard still */
-bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
-  s->cluster_size);
+discard_no_unref_any_file(bs, old_l2_entry & L2E_OFFSET_MASK,
+  s->cluster_size, cluster_type, type);
 }
 }
 
@@ -2088,12 +2108,11 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
 if (!keep_reference) {
 /* Then decrease the refcount */
 qcow2_free_any_cluster(bs, old_l2_entry, 
QCOW2_DISCARD_REQUEST);
-} else if (s->discard_passthrough[QCOW2_DISCARD_REQUEST] &&
-   (type == QCOW2_CLUSTER_NORMAL ||
-type == QCOW2_CLUSTER_ZERO_ALLOC)) {
+} else {
 /* If we keep the reference, pass on the discard still */
-bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
-s->cluster_size);
+discard_no_unref_any_file(bs, old_l2_entry & L2E_OFFSET_MASK,
+  s->cluster_size, type,
+  QCOW2_DISCARD_REQUEST);
 }
 }
 }
-- 
2.39.3




[PATCH v2 06/11] iotests/290: add test case to check 'discard-no-unref' option behavior

2024-05-13 Thread Andrey Drobyshev
We basically fill 2 images with identical data and perform discard
operations with and without 'discard-no-unref' enabled.  Then we check
that images still read identically, that their disk usage is the same
(i.e. fallocate(FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE) is called for
both) and that with the option enabled cluster is still marked as
allocated in "qemu-img map" output.  We also check that the option
doesn't work with qcow2 v2 images.

Signed-off-by: Andrey Drobyshev 
---
 tests/qemu-iotests/290 | 34 ++
 tests/qemu-iotests/290.out | 28 
 2 files changed, 62 insertions(+)

diff --git a/tests/qemu-iotests/290 b/tests/qemu-iotests/290
index 776b59de1b..4eb929d15f 100755
--- a/tests/qemu-iotests/290
+++ b/tests/qemu-iotests/290
@@ -92,6 +92,40 @@ for qcow2_compat in 0.10 1.1; do
 $QEMU_IMG map "$TEST_IMG" | _filter_testdir
 done
 
+echo
+echo "### Test 'qemu-io -c discard' with 'discard-no-unref' option enabled"
+echo
+
+echo "# Check that qcow2 v2 images don't support 'discard-no-unref' option"
+NOUNREF_IMG="$TEST_IMG.nounref"
+TEST_IMG="$NOUNREF_IMG" _make_test_img -o "compat=0.10" 128k
+# This should immediately fail with an error
+$QEMU_IO -c 'reopen -o discard-no-unref=on' "$NOUNREF_IMG" | _filter_qemu_io
+
+echo "# Create two compat=1.1 images and fill them with identical data"
+_make_test_img -o "compat=1.1" 128k
+TEST_IMG="$NOUNREF_IMG" _make_test_img -o "compat=1.1" 128k
+$QEMU_IO -c 'write -P 0xaa 0 128k' "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c 'write -P 0xaa 0 128k' "$NOUNREF_IMG" | _filter_qemu_io
+
+echo "# Enable 'discard-no-unref' in one of them, discard 2nd cluster in both"
+$QEMU_IO -c 'discard 64k 64k' "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c 'reopen -o discard-no-unref=on' \
+ -c 'discard 64k 64k' "$NOUNREF_IMG" | _filter_qemu_io
+
+echo "# Compare disk usage of the 2 images"
+# Don't check the exact disk usage values but rather that they're equal
+echo "disk_usage(`basename $TEST_IMG`) - disk_usage(`basename $NOUNREF_IMG`)" \
+ "= $(( `disk_usage $TEST_IMG` - `disk_usage $NOUNREF_IMG`))"
+
+echo "# Check that images are still identical"
+$QEMU_IMG compare "$TEST_IMG" "$NOUNREF_IMG"
+
+echo "# Output of qemu-img map for the image with dropped reference"
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+echo "# Output of qemu-img map for the image with kept reference"
+$QEMU_IMG map --output=json "$NOUNREF_IMG" | _filter_qemu_img_map
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/290.out b/tests/qemu-iotests/290.out
index 22b476594f..f790feae81 100644
--- a/tests/qemu-iotests/290.out
+++ b/tests/qemu-iotests/290.out
@@ -58,4 +58,32 @@ read 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 # Output of qemu-img map
 Offset  Length  Mapped to   File
+
+### Test 'qemu-io -c discard' with 'discard-no-unref' option enabled
+
+# Check that qcow2 v2 images don't support 'discard-no-unref' option
+Formatting 'TEST_DIR/t.IMGFMT.nounref', fmt=IMGFMT size=131072
+qemu-io: discard-no-unref is only supported since qcow2 version 3
+# Create two compat=1.1 images and fill them with identical data
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=131072
+Formatting 'TEST_DIR/t.IMGFMT.nounref', fmt=IMGFMT size=131072
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+# Enable 'discard-no-unref' in one of them, discard 2nd cluster in both
+discard 65536/65536 bytes at offset 65536
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+discard 65536/65536 bytes at offset 65536
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+# Compare disk usage of the 2 images
+disk_usage(t.qcow2) - disk_usage(t.qcow2.nounref) = 0
+# Check that images are still identical
+Images are identical.
+# Output of qemu-img map for the image with dropped reference
+[{ "start": 0, "length": 65536, "depth": 0, "present": true, "zero": false, 
"data": true, "compressed": false, "offset": OFFSET},
+{ "start": 65536, "length": 65536, "depth": 0, "present": true, "zero": true, 
"data": false, "compressed": false}]
+# Output of qemu-img map for the image with kept reference
+[{ "start": 0, "length": 65536, "depth": 0, "present": true, "zero": false, 
"data": true, "compressed": false, "offset": OFFSET},
+{ "start": 65536, "length": 65536, "depth": 0, "present": true, "zero": true, 
"data": false, "compressed": false, "offset": OFFSET}]
 *** done
-- 
2.39.3




[PATCH v2 00/11] qcow2: make subclusters discardable

2024-05-13 Thread Andrey Drobyshev
v1: https://lists.nongnu.org/archive/html/qemu-devel/2023-10/msg07223.html

Andrey Drobyshev (11):
  qcow2: make function update_refcount_discard() global
  qcow2: simplify L2 entries accounting for discard-no-unref
  qcow2: put discard requests in the common queue when discard-no-unref
enabled
  block/file-posix: add trace event for fallocate() calls
  iotests/common.rc: add disk_usage function
  iotests/290: add test case to check 'discard-no-unref' option behavior
  qcow2: add get_sc_range_info() helper for working with subcluster
ranges
  qcow2: zeroize the entire cluster when there're no non-zero
subclusters
  qcow2: make subclusters discardable
  qcow2: zero_l2_subclusters: fall through to discard operation when
requested
  iotests/271: add test cases for subcluster-based discard/unmap

 block/file-posix.c   |   1 +
 block/qcow2-cluster.c| 346 ---
 block/qcow2-refcount.c   |   8 +-
 block/qcow2-snapshot.c   |   6 +-
 block/qcow2.c|  25 +--
 block/qcow2.h|   6 +-
 block/trace-events   |   1 +
 tests/qemu-iotests/250   |   5 -
 tests/qemu-iotests/271   |  72 ++--
 tests/qemu-iotests/271.out   |  69 ++-
 tests/qemu-iotests/290   |  34 
 tests/qemu-iotests/290.out   |  28 +++
 tests/qemu-iotests/common.rc |   6 +
 13 files changed, 490 insertions(+), 117 deletions(-)

-- 
2.39.3




[PATCH v2 11/11] iotests/271: add test cases for subcluster-based discard/unmap

2024-05-13 Thread Andrey Drobyshev
Add a bunch of test cases covering new subclusters behaviour: unmap of
last allocated subclusters; unmap of subclusters within unallocated
cluster; discard of unallocated subclusters within a cluster; regular discard
of subclusters within a cluster; discard of last allocated subclusters.

Also make all discard/unmap operations enable trace point 'file_do_fallocate'
so that actual fallocate() calls are visible.

Signed-off-by: Andrey Drobyshev 
---
 tests/qemu-iotests/271 | 70 +-
 tests/qemu-iotests/271.out | 69 ++---
 2 files changed, 119 insertions(+), 20 deletions(-)

diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271
index 04c57813c2..8b80682cff 100755
--- a/tests/qemu-iotests/271
+++ b/tests/qemu-iotests/271
@@ -81,6 +81,12 @@ _verify_l2_bitmap()
 fi
 }
 
+# Filter out trace params which we don't control (file fd)
+_filter_trace_fallocate()
+{
+sed -E 's/fd=[0-9]+/fd=N/g'
+}
+
 # This should be called as _run_test c=XXX sc=XXX off=XXX len=XXX cmd=XXX
 # c:   cluster number (0 if unset)
 # sc:  subcluster number inside cluster @c (0 if unset)
@@ -94,12 +100,13 @@ _verify_l2_bitmap()
 #  discard  -> discard
 _run_test()
 {
-unset c sc off len cmd
+unset c sc off len cmd opt
 for var in "$@"; do eval "$var"; done
 case "${cmd:-write}" in
 zero)
 cmd="write -q -z";;
 unmap)
+opt="--trace enable=file_do_fallocate"
 cmd="write -q -z -u";;
 compress)
 pat=$((${pat:-0} + 1))
@@ -108,6 +115,7 @@ _run_test()
 pat=$((${pat:-0} + 1))
 cmd="write -q -P ${pat}";;
 discard)
+opt="--trace enable=file_do_fallocate"
 cmd="discard -q";;
 *)
 echo "Unknown option $cmd"
@@ -121,7 +129,7 @@ _run_test()
 cmd="$cmd ${offset} ${len}"
 raw_cmd=$(echo $cmd | sed s/-c//) # Raw images don't support -c
 echo $cmd | sed 's/-P [0-9][0-9]\?/-P PATTERN/'
-$QEMU_IO -c "$cmd" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO $opt -c "$cmd" "$TEST_IMG" 2>&1 | _filter_qemu_io | 
_filter_trace_fallocate
 $QEMU_IO -c "$raw_cmd" -f raw "$TEST_IMG.raw" | _filter_qemu_io
 _verify_img
 _verify_l2_bitmap "$c"
@@ -202,9 +210,10 @@ for use_backing_file in yes no; do
 alloc="$(seq 16 31)"; zero="$(seq 0 15)"
 _run_test sc=0 len=32k cmd=unmap
 
-### Zero and unmap cluster #0
+### Zero and unmap second half of cluster #0 (this will unmap it and
+### discard l2 entry)
 alloc=""; zero="$(seq 0 31)"
-_run_test sc=0 len=64k cmd=unmap
+_run_test sc=16 len=32k cmd=unmap
 
 ### Write subcluster #1 (middle of subcluster)
 alloc="1"; zero="0 $(seq 2 31)"
@@ -439,6 +448,12 @@ for use_backing_file in yes no; do
 _verify_l2_bitmap 16
 _verify_l2_bitmap 17
 
+# Unmap subclusters #0-#3 of an unallocated cluster #8.  Since
+# 'write -z -u' doesn't lead to full discard, subclusters should become
+# explicitly zeroized
+alloc=""; zero="$(seq 0 3)"
+_run_test c=8 sc=0 len=8k cmd=unmap
+
 # Cluster-aligned request from clusters #9 to #11
 alloc=""; zero="$(seq 0 31)"
 _run_test c=9 sc=0 len=192k cmd=unmap
@@ -523,26 +538,45 @@ for use_backing_file in yes no; do
 echo
 echo "### Discarding clusters with non-zero bitmaps (backing file: 
$use_backing_file) ###"
 echo
+
+_reset_img 1M
+
+# Write first half of cluster #0 and discard another half
+alloc="$(seq 0 15)"; zero=""
+_run_test sc=0 len=32k
+# When discarding unallocated subclusters, they only have to be
+# explicitly zeroized when the image has a backing file
 if [ "$use_backing_file" = "yes" ]; then
-_make_test_img -o extended_l2=on -F raw -b "$TEST_IMG.base" 1M
+alloc="$(seq 0 15)"; zero="$(seq 16 31)"
 else
-_make_test_img -o extended_l2=on 1M
+alloc="$(seq 0 15)"; zero=""
 fi
-# Write clusters #0-#2 and then discard them
-$QEMU_IO -c 'write -q 0 128k' "$TEST_IMG"
-$QEMU_IO -c 'discard -q 0 128k' "$TEST_IMG"
+_run_test sc=16 len=32k cmd=discard
+
+# Write cluster #0 and discard its subclusters #0-#3
+alloc="$(seq 0 31)"; zero=""
+_run_test sc=0 len=64k
+alloc="$(seq 4 31)"; zero="$(seq 0 3)"
+_run_test sc=0 len=8k cmd=discard
+
+# Discard remaining subclusters #4-#32.  This should unmap the cluster
+# and discard its l2 entry
+alloc=""; zero="$(seq 0 31)"
+_r

[PATCH v2 07/11] qcow2: add get_sc_range_info() helper for working with subcluster ranges

2024-05-13 Thread Andrey Drobyshev
This helper simply obtains the l2 table parameters of the cluster which
contains the given subclusters range.  Right now this info is being
obtained and used by zero_l2_subclusters().  As we're about to introduce
the subclusters discard operation, this helper would let us avoid code
duplication.

Also introduce struct SubClusterRangeInfo, which would contain all the
needed params.

Signed-off-by: Andrey Drobyshev 
---
 block/qcow2-cluster.c | 140 --
 1 file changed, 108 insertions(+), 32 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 7dff0bd5a1..475f167035 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1915,6 +1915,103 @@ discard_no_unref_any_file(BlockDriverState *bs, 
uint64_t offset,
 }
 }
 
+/*
+ * Structure containing info about the subclusters range within one cluster.
+ *
+ * Since @l2_slice is a strong reference to the l2 table slice containing
+ * the corresponding l2 entry, it must be explicitly released by
+ * qcow2_cache_put().  Thus the user must either declare it with g_auto()
+ * (in which case sc_range_info_cleanup() is called automatically) or do
+ * the cleanup themselves.
+ */
+typedef struct SubClusterRangeInfo {
+uint64_t *l2_slice;
+int l2_index;
+uint64_t l2_entry;
+uint64_t l2_bitmap;
+QCow2ClusterType ctype;
+Qcow2Cache *l2_table_cache;
+} SubClusterRangeInfo;
+
+static void sc_range_info_cleanup(SubClusterRangeInfo *scri)
+{
+if (scri->l2_table_cache && scri->l2_slice) {
+qcow2_cache_put(scri->l2_table_cache, (void **) >l2_slice);
+}
+}
+
+G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(SubClusterRangeInfo, sc_range_info_cleanup);
+
+/*
+ * For a given @offset and @nb_subclusters, fill out the SubClusterRangeInfo
+ * structure describing the subclusters range and referred to by @scri.
+ * Only the subclusters which can be independently discarded/zeroized
+ * (i.e. not compressed or invalid) are considered to be valid here.
+ *
+ * The subclusters range is denoted by @offset and @nb_subclusters and must
+ * not cross the cluster boundary.  @offset must be aligned to the subcluster
+ * size.
+ *
+ * Return: 0 if the SubClusterRangeInfo is successfully filled out and the
+ * subclusters within the given range might be discarded/zeroized;
+ * -EINVAL if any of the subclusters within the range is invalid;
+ * -ENOTSUP if the range is contained within a compressed cluster.
+ */
+static int GRAPH_RDLOCK
+get_sc_range_info(BlockDriverState *bs, uint64_t offset,
+  unsigned nb_subclusters, SubClusterRangeInfo *scri)
+{
+BDRVQcow2State *s = bs->opaque;
+int ret, sc_cleared, sc_index = offset_to_sc_index(s, offset);
+QCow2SubclusterType sctype;
+
+/* Here we only work with the subclusters within single cluster. */
+assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster);
+assert(sc_index + nb_subclusters <= s->subclusters_per_cluster);
+assert(offset_into_subcluster(s, offset) == 0);
+
+scri->l2_table_cache = s->l2_table_cache;
+
+ret = get_cluster_table(bs, offset, >l2_slice, >l2_index);
+if (ret < 0) {
+goto cleanup;
+}
+
+scri->l2_entry = get_l2_entry(s, scri->l2_slice, scri->l2_index);
+scri->l2_bitmap = get_l2_bitmap(s, scri->l2_slice, scri->l2_index);
+scri->ctype = qcow2_get_cluster_type(bs, scri->l2_entry);
+
+sc_cleared = 0;
+do {
+ret = qcow2_get_subcluster_range_type(
+bs, scri->l2_entry, scri->l2_bitmap, sc_index + sc_cleared,
+);
+if (ret < 0) {
+goto cleanup;
+}
+
+switch (sctype) {
+case QCOW2_SUBCLUSTER_COMPRESSED:
+/* We cannot partially zeroize/discard compressed clusters. */
+ret = -ENOTSUP;
+goto cleanup;
+case QCOW2_SUBCLUSTER_INVALID:
+ret = -EINVAL;
+goto cleanup;
+default:
+break;
+}
+
+sc_cleared += ret;
+} while (sc_cleared < nb_subclusters);
+
+return 0;
+
+cleanup:
+sc_range_info_cleanup(scri);
+return ret;
+}
+
 /*
  * This discards as many clusters of nb_clusters as possible at once (i.e.
  * all clusters in the same L2 slice) and returns the number of discarded
@@ -2127,46 +2224,25 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t 
offset,
 unsigned nb_subclusters)
 {
 BDRVQcow2State *s = bs->opaque;
-uint64_t *l2_slice;
-uint64_t old_l2_bitmap, l2_bitmap;
-int l2_index, ret, sc = offset_to_sc_index(s, offset);
-
-/* For full clusters use zero_in_l2_slice() instead */
-assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster);
-assert(sc + nb_subclusters <= s->subclusters_per_cluster);
-assert(offset_into_subcluster(s, offset) == 0);
+uint64_t

[PATCH v2 05/11] iotests/common.rc: add disk_usage function

2024-05-13 Thread Andrey Drobyshev
Move the definition from iotests/250 to common.rc.  This is used to
detect real disk usage of sparse files.  In particular, we want to use
it for checking subclusters-based discards.

Signed-off-by: Andrey Drobyshev 
---
 tests/qemu-iotests/250   | 5 -
 tests/qemu-iotests/common.rc | 6 ++
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/qemu-iotests/250 b/tests/qemu-iotests/250
index af48f83aba..c0a0dbc0ff 100755
--- a/tests/qemu-iotests/250
+++ b/tests/qemu-iotests/250
@@ -52,11 +52,6 @@ _unsupported_imgopts data_file
 # bdrv_co_truncate(bs->file) call in qcow2_co_truncate(), which might succeed
 # anyway.
 
-disk_usage()
-{
-du --block-size=1 $1 | awk '{print $1}'
-}
-
 size=2100M
 
 _make_test_img -o "cluster_size=1M,preallocation=metadata" $size
diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 95c12577dd..237f746af8 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -140,6 +140,12 @@ _optstr_add()
 fi
 }
 
+# report real disk usage for sparse files
+disk_usage()
+{
+du --block-size=1 "$1" | awk '{print $1}'
+}
+
 # Set the variables to the empty string to turn Valgrind off
 # for specific processes, e.g.
 # $ VALGRIND_QEMU_IO= ./check -qcow2 -valgrind 015
-- 
2.39.3




[PATCH v2 09/11] qcow2: make subclusters discardable

2024-05-13 Thread Andrey Drobyshev
This commit makes the discard operation work on the subcluster level
rather than cluster level.  It introduces discard_l2_subclusters()
function and makes use of it in qcow2 discard implementation, much like
it's done with zero_in_l2_slice() / zero_l2_subclusters().  It also
changes the qcow2 driver pdiscard_alignment to subcluster_size.  That
way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE)
operation and free host disk space.

This feature will let us gain additional disk space on guest
TRIM/discard requests, especially when using large enough clusters
(1M, 2M) with subclusters enabled.

Also rename qcow2_cluster_discard() -> qcow2_subcluster_discard() to
reflect the change.

Signed-off-by: Andrey Drobyshev 
---
 block/qcow2-cluster.c  | 106 +
 block/qcow2-snapshot.c |   6 +--
 block/qcow2.c  |  25 +-
 block/qcow2.h  |   4 +-
 tests/qemu-iotests/271 |   2 +-
 5 files changed, 117 insertions(+), 26 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 8d39e2f960..3c134a7e80 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -2105,25 +2105,106 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t 
offset, uint64_t nb_clusters,
 return nb_clusters;
 }
 
-int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
-  uint64_t bytes, enum qcow2_discard_type type,
-  bool full_discard)
+static int coroutine_fn GRAPH_RDLOCK
+discard_l2_subclusters(BlockDriverState *bs, uint64_t offset,
+   uint64_t nb_subclusters,
+   enum qcow2_discard_type type,
+   bool full_discard)
+{
+BDRVQcow2State *s = bs->opaque;
+uint64_t new_l2_bitmap, bitmap_alloc_mask, bitmap_zero_mask;
+int ret, sc = offset_to_sc_index(s, offset);
+g_auto(SubClusterRangeInfo) scri = { 0 };
+
+ret = get_sc_range_info(bs, offset, nb_subclusters, );
+if (ret < 0) {
+return ret;
+}
+
+new_l2_bitmap = scri.l2_bitmap;
+bitmap_alloc_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
+bitmap_zero_mask = QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
+
+new_l2_bitmap &= ~bitmap_alloc_mask;
+
+/*
+ * Full discard means we fall through to the backing file, thus we need
+ * to mark the subclusters as deallocated and clear the corresponding
+ * zero bits.
+ *
+ * Non-full discard means subclusters should be explicitly marked as
+ * zeroes.  In this case QCOW2 specification requires the corresponding
+ * allocation status bits to be unset as well.  If the subclusters are
+ * deallocated in the first place and there's no backing, the operation
+ * can be skipped.
+ */
+if (full_discard) {
+new_l2_bitmap &= ~bitmap_zero_mask;
+} else if (bs->backing || scri.l2_bitmap & bitmap_alloc_mask) {
+new_l2_bitmap |= bitmap_zero_mask;
+}
+
+/*
+ * If after discarding this range there won't be any allocated subclusters
+ * left, and new bitmap becomes the same as it'd be after discarding the
+ * whole cluster, we better discard it entirely.  That way we'd also
+ * update the refcount table.
+ */
+if ((full_discard && new_l2_bitmap == 0) ||
+(!full_discard && new_l2_bitmap == QCOW_L2_BITMAP_ALL_ZEROES)) {
+return discard_in_l2_slice(
+bs, QEMU_ALIGN_DOWN(offset, s->cluster_size),
+1, type, full_discard);
+}
+
+if (scri.l2_bitmap != new_l2_bitmap) {
+set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap);
+qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice);
+}
+
+discard_no_unref_any_file(
+bs, (scri.l2_entry & L2E_OFFSET_MASK) + offset_into_cluster(s, offset),
+nb_subclusters * s->subcluster_size, scri.ctype, type);
+
+return 0;
+}
+
+int qcow2_subcluster_discard(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, enum qcow2_discard_type type,
+ bool full_discard)
 {
 BDRVQcow2State *s = bs->opaque;
 uint64_t end_offset = offset + bytes;
 uint64_t nb_clusters;
+unsigned head, tail;
 int64_t cleared;
 int ret;
 
 /* Caller must pass aligned values, except at image end */
-assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
-assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
+assert(QEMU_IS_ALIGNED(offset, s->subcluster_size));
+assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) ||
end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
 
-nb_clusters = size_to_clusters(s, bytes);
+head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset;
+offset += head;
+
+tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 :
+ 

[PATCH v2 02/11] qcow2: simplify L2 entries accounting for discard-no-unref

2024-05-13 Thread Andrey Drobyshev
Commits 42a2890a and b2b10904 introduce handling of discard-no-unref
option in discard_in_l2_slice() and zero_in_l2_slice().  They add even
more if's when chosing the right l2 entry.  What we really need for this
option is the new entry simply to contain the same host cluster offset,
no matter whether we unmap or zeroize the cluster.  For that OR'ing with
the old entry is enough.

This patch doesn't change the logic and is pure refactoring.

Signed-off-by: Andrey Drobyshev 
---
 block/qcow2-cluster.c | 34 +++---
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index ce8c0076b3..5f057ba2fd 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1946,25 +1946,21 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t 
offset, uint64_t nb_clusters,
 new_l2_entry = new_l2_bitmap = 0;
 } else if (bs->backing || qcow2_cluster_is_allocated(cluster_type)) {
 if (has_subclusters(s)) {
-if (keep_reference) {
-new_l2_entry = old_l2_entry;
-} else {
-new_l2_entry = 0;
-}
+new_l2_entry = 0;
 new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
 } else {
-if (s->qcow_version >= 3) {
-if (keep_reference) {
-new_l2_entry |= QCOW_OFLAG_ZERO;
-} else {
-new_l2_entry = QCOW_OFLAG_ZERO;
-}
-} else {
-new_l2_entry = 0;
-}
+new_l2_entry = s->qcow_version >= 3 ? QCOW_OFLAG_ZERO : 0;
 }
 }
 
+/*
+ * No need to check for the QCOW version since discard-no-unref is
+ * only allowed since version 3.
+ */
+if (keep_reference) {
+new_l2_entry |= old_l2_entry;
+}
+
 if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) {
 continue;
 }
@@ -2064,19 +2060,19 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
 ((flags & BDRV_REQ_MAY_UNMAP) && qcow2_cluster_is_allocated(type));
 bool keep_reference =
 (s->discard_no_unref && type != QCOW2_CLUSTER_COMPRESSED);
-uint64_t new_l2_entry = old_l2_entry;
+uint64_t new_l2_entry = unmap ? 0 : old_l2_entry;
 uint64_t new_l2_bitmap = old_l2_bitmap;
 
-if (unmap && !keep_reference) {
-new_l2_entry = 0;
-}
-
 if (has_subclusters(s)) {
 new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
 } else {
 new_l2_entry |= QCOW_OFLAG_ZERO;
 }
 
+if (keep_reference) {
+new_l2_entry |= old_l2_entry;
+}
+
 if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) {
 continue;
 }
-- 
2.39.3




[PATCH v2 04/11] block/file-posix: add trace event for fallocate() calls

2024-05-13 Thread Andrey Drobyshev
This would ease debugging of write zeroes and discard operations.

Signed-off-by: Andrey Drobyshev 
---
 block/file-posix.c | 1 +
 block/trace-events | 1 +
 2 files changed, 2 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index 35684f7e21..45134f0eef 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1859,6 +1859,7 @@ static int translate_err(int err)
 static int do_fallocate(int fd, int mode, off_t offset, off_t len)
 {
 do {
+trace_file_do_fallocate(fd, mode, offset, len);
 if (fallocate(fd, mode, offset, len) == 0) {
 return 0;
 }
diff --git a/block/trace-events b/block/trace-events
index 8e789e1f12..2f7ad28996 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -203,6 +203,7 @@ curl_setup_preadv(uint64_t bytes, uint64_t start, const 
char *range) "reading %"
 curl_close(void) "close"
 
 # file-posix.c
+file_do_fallocate(int fd, int mode, int64_t offset, int64_t len) "fd=%d 
mode=0x%02x offset=%" PRIi64 " len=%" PRIi64
 file_copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t 
dst_off, int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset 
%"PRIu64" dst_fd %d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64
 file_FindEjectableOpticalMedia(const char *media) "Matching using %s"
 file_setup_cdrom(const char *partition) "Using %s as optical disc"
-- 
2.39.3




[PATCH v2 10/11] qcow2: zero_l2_subclusters: fall through to discard operation when requested

2024-05-13 Thread Andrey Drobyshev
When zeroizing subclusters within single cluster, detect usage of the
BDRV_REQ_MAY_UNMAP flag and fall through to the subcluster-based discard
operation, much like it's done with the cluster-based discards.  That
way subcluster-aligned operations "qemu-io -c 'write -z -u ...'" will
lead to actual unmap.

Signed-off-by: Andrey Drobyshev 
---
 block/qcow2-cluster.c | 26 --
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 3c134a7e80..53e04eff93 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -2107,15 +2107,16 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t 
offset, uint64_t nb_clusters,
 
 static int coroutine_fn GRAPH_RDLOCK
 discard_l2_subclusters(BlockDriverState *bs, uint64_t offset,
-   uint64_t nb_subclusters,
-   enum qcow2_discard_type type,
-   bool full_discard)
+   uint64_t nb_subclusters, enum qcow2_discard_type type,
+   bool full_discard, bool uncond_zeroize)
 {
 BDRVQcow2State *s = bs->opaque;
 uint64_t new_l2_bitmap, bitmap_alloc_mask, bitmap_zero_mask;
 int ret, sc = offset_to_sc_index(s, offset);
 g_auto(SubClusterRangeInfo) scri = { 0 };
 
+assert(!(full_discard && uncond_zeroize));
+
 ret = get_sc_range_info(bs, offset, nb_subclusters, );
 if (ret < 0) {
 return ret;
@@ -2140,7 +2141,8 @@ discard_l2_subclusters(BlockDriverState *bs, uint64_t 
offset,
  */
 if (full_discard) {
 new_l2_bitmap &= ~bitmap_zero_mask;
-} else if (bs->backing || scri.l2_bitmap & bitmap_alloc_mask) {
+} else if (uncond_zeroize || bs->backing ||
+   scri.l2_bitmap & bitmap_alloc_mask) {
 new_l2_bitmap |= bitmap_zero_mask;
 }
 
@@ -2197,7 +2199,7 @@ int qcow2_subcluster_discard(BlockDriverState *bs, 
uint64_t offset,
 if (head) {
 ret = discard_l2_subclusters(bs, offset - head,
  size_to_subclusters(s, head), type,
- full_discard);
+ full_discard, false);
 if (ret < 0) {
 goto fail;
 }
@@ -2221,7 +2223,7 @@ int qcow2_subcluster_discard(BlockDriverState *bs, 
uint64_t offset,
 if (tail) {
 ret = discard_l2_subclusters(bs, end_offset,
  size_to_subclusters(s, tail), type,
- full_discard);
+ full_discard, false);
 if (ret < 0) {
 goto fail;
 }
@@ -2318,6 +2320,18 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t 
offset,
 int ret, sc = offset_to_sc_index(s, offset);
 g_auto(SubClusterRangeInfo) scri = { 0 };
 
+/*
+ * If the request allows discarding subclusters we go down the discard
+ * path regardless of their allocation status.  After the discard
+ * operation with full_discard=false subclusters are going to be read as
+ * zeroes anyway.  But we make sure that subclusters are explicitly
+ * zeroed anyway with uncond_zeroize=true.
+ */
+if (flags & BDRV_REQ_MAY_UNMAP) {
+return discard_l2_subclusters(bs, offset, nb_subclusters,
+  QCOW2_DISCARD_REQUEST, false, true);
+}
+
 ret = get_sc_range_info(bs, offset, nb_subclusters, );
 if (ret < 0) {
 return ret;
-- 
2.39.3




[PATCH v2 01/11] qcow2: make function update_refcount_discard() global

2024-05-13 Thread Andrey Drobyshev
We are going to need it for discarding separate subclusters.  The
function itself doesn't do anything with the refcount tables, it simply
adds a discard request to the queue, so rename it to qcow2_queue_discard().

Signed-off-by: Andrey Drobyshev 
Reviewed-by: Hanna Czenczek 
---
 block/qcow2-refcount.c | 8 
 block/qcow2.h  | 2 ++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 0266542cee..2026f5fa21 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -754,8 +754,8 @@ void qcow2_process_discards(BlockDriverState *bs, int ret)
 }
 }
 
-static void update_refcount_discard(BlockDriverState *bs,
-uint64_t offset, uint64_t length)
+void qcow2_queue_discard(BlockDriverState *bs, uint64_t offset,
+ uint64_t length)
 {
 BDRVQcow2State *s = bs->opaque;
 Qcow2DiscardRegion *d, *p, *next;
@@ -902,7 +902,7 @@ update_refcount(BlockDriverState *bs, int64_t offset, 
int64_t length,
 }
 
 if (s->discard_passthrough[type]) {
-update_refcount_discard(bs, cluster_offset, s->cluster_size);
+qcow2_queue_discard(bs, cluster_offset, s->cluster_size);
 }
 }
 }
@@ -3619,7 +3619,7 @@ qcow2_discard_refcount_block(BlockDriverState *bs, 
uint64_t discard_block_offs)
 /* discard refblock from the cache if refblock is cached */
 qcow2_cache_discard(s->refcount_block_cache, refblock);
 }
-update_refcount_discard(bs, discard_block_offs, s->cluster_size);
+qcow2_queue_discard(bs, discard_block_offs, s->cluster_size);
 
 return 0;
 }
diff --git a/block/qcow2.h b/block/qcow2.h
index a9e3481c6e..197bdcdf53 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -891,6 +891,8 @@ int coroutine_fn qcow2_check_refcounts(BlockDriverState 
*bs, BdrvCheckResult *re
BdrvCheckMode fix);
 
 void GRAPH_RDLOCK qcow2_process_discards(BlockDriverState *bs, int ret);
+void qcow2_queue_discard(BlockDriverState *bs, uint64_t offset,
+ uint64_t length);
 
 int GRAPH_RDLOCK
 qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
-- 
2.39.3




[PATCH v2 08/11] qcow2: zeroize the entire cluster when there're no non-zero subclusters

2024-05-13 Thread Andrey Drobyshev
When zeroizing the last non-zero subclusters within single cluster, it
makes sense to go zeroize the entire cluster and go down zero_in_l2_slice()
path right away.  That way we'd also update the corresponding refcount
table.

Signed-off-by: Andrey Drobyshev 
Reviewed-by: Hanna Czenczek 
---
 block/qcow2-cluster.c | 17 ++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 475f167035..8d39e2f960 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -2221,7 +2221,7 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
 
 static int coroutine_fn GRAPH_RDLOCK
 zero_l2_subclusters(BlockDriverState *bs, uint64_t offset,
-unsigned nb_subclusters)
+unsigned nb_subclusters, int flags)
 {
 BDRVQcow2State *s = bs->opaque;
 uint64_t new_l2_bitmap;
@@ -2237,6 +2237,16 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t 
offset,
 new_l2_bitmap |=  QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
 new_l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
 
+/*
+ * If there're no non-zero subclusters left, we might as well zeroize
+ * the entire cluster.  That way we'd also update the refcount table.
+ */
+if ((new_l2_bitmap & QCOW_L2_BITMAP_ALL_ZEROES) ==
+QCOW_L2_BITMAP_ALL_ZEROES) {
+return zero_in_l2_slice(bs, QEMU_ALIGN_DOWN(offset, s->cluster_size),
+1, flags);
+}
+
 if (new_l2_bitmap != scri.l2_bitmap) {
 set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap);
 qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice);
@@ -2293,7 +2303,7 @@ int coroutine_fn 
qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
 
 if (head) {
 ret = zero_l2_subclusters(bs, offset - head,
-  size_to_subclusters(s, head));
+  size_to_subclusters(s, head), flags);
 if (ret < 0) {
 goto fail;
 }
@@ -2314,7 +2324,8 @@ int coroutine_fn 
qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
 }
 
 if (tail) {
-ret = zero_l2_subclusters(bs, end_offset, size_to_subclusters(s, 
tail));
+ret = zero_l2_subclusters(bs, end_offset,
+  size_to_subclusters(s, tail), flags);
 if (ret < 0) {
 goto fail;
 }
-- 
2.39.3




Re: [BUG, RFC] Base node is in RW after making external snapshot

2024-05-10 Thread Andrey Drobyshev
On 4/24/24 21:00, Andrey Drobyshev wrote:
> Hi everyone,
> 
> When making an external snapshot, we end up in a situation when 2 block
> graph nodes related to the same image file (format and storage nodes)
> have different RO flags set on them.
> 
> E.g.
> 
> # ls -la /proc/PID/fd
> lrwx-- 1 root qemu 64 Apr 24 20:14 12 -> /path/to/harddisk.hdd
> 
> # virsh qemu-monitor-command VM '{"execute": "query-named-block-nodes"}'
> --pretty | egrep '"node-name"|"ro"'
>   "ro": false,
>   "node-name": "libvirt-1-format",
>   "ro": false,
>   "node-name": "libvirt-1-storage",
> 
> # virsh snapshot-create-as VM --name snap --disk-only
> Domain snapshot snap created
> 
> # ls -la /proc/PID/fd
> lr-x-- 1 root qemu 64 Apr 24 20:14 134 -> /path/to/harddisk.hdd
> lrwx-- 1 root qemu 64 Apr 24 20:14 135 -> /path/to/harddisk.snap
> 
> # virsh qemu-monitor-command VM '{"execute": "query-named-block-nodes"}'
> --pretty | egrep '"node-name"|"ro"'
>   "ro": false,
>   "node-name": "libvirt-2-format",
>   "ro": false,
>   "node-name": "libvirt-2-storage",
>   "ro": true,
>   "node-name": "libvirt-1-format",
>   "ro": false,<--
>   "node-name": "libvirt-1-storage",
> 
> File descriptor has been reopened in RO, but "libvirt-1-storage" node
> still has RW permissions set.
> 
> I'm wondering it this a bug or this is intended?  Looks like a bug to
> me, although I see that some iotests (e.g. 273) expect 2 nodes related
> to the same image file to have different RO flags.
> 
> bdrv_reopen_set_read_only()
>   bdrv_reopen()
> bdrv_reopen_queue()
>   bdrv_reopen_queue_child()
> bdrv_reopen_multiple()
>   bdrv_list_refresh_perms()
> bdrv_topological_dfs()
> bdrv_do_refresh_perms()
>   bdrv_reopen_commit()
> 
> In the stack above bdrv_reopen_set_read_only() is only being called for
> the parent (libvirt-1-format) node.  There're 2 lists: BDSs from
> refresh_list are used by bdrv_drv_set_perm and this leads to actual
> reopen with RO of the file descriptor.  And then there's reopen queue
> bs_queue -- BDSs from this queue get their parameters updated.  While
> refresh_list ends up having the whole subtree (including children, this
> is done in bdrv_topological_dfs()) bs_queue only has the parent.  And
> that is because storage (child) node's (bs->inherits_from == NULL), so
> bdrv_reopen_queue_child() never adds it to the queue.  Could it be the
> source of this bug?
> 
> Anyway, would greatly appreciate a clarification.
> 
> Andrey

Friendly ping.  Could somebody confirm that it is a bug indeed?




[BUG, RFC] Base node is in RW after making external snapshot

2024-04-24 Thread Andrey Drobyshev
Hi everyone,

When making an external snapshot, we end up in a situation when 2 block
graph nodes related to the same image file (format and storage nodes)
have different RO flags set on them.

E.g.

# ls -la /proc/PID/fd
lrwx-- 1 root qemu 64 Apr 24 20:14 12 -> /path/to/harddisk.hdd

# virsh qemu-monitor-command VM '{"execute": "query-named-block-nodes"}'
--pretty | egrep '"node-name"|"ro"'
  "ro": false,
  "node-name": "libvirt-1-format",
  "ro": false,
  "node-name": "libvirt-1-storage",

# virsh snapshot-create-as VM --name snap --disk-only
Domain snapshot snap created

# ls -la /proc/PID/fd
lr-x-- 1 root qemu 64 Apr 24 20:14 134 -> /path/to/harddisk.hdd
lrwx-- 1 root qemu 64 Apr 24 20:14 135 -> /path/to/harddisk.snap

# virsh qemu-monitor-command VM '{"execute": "query-named-block-nodes"}'
--pretty | egrep '"node-name"|"ro"'
  "ro": false,
  "node-name": "libvirt-2-format",
  "ro": false,
  "node-name": "libvirt-2-storage",
  "ro": true,
  "node-name": "libvirt-1-format",
  "ro": false,<--
  "node-name": "libvirt-1-storage",

File descriptor has been reopened in RO, but "libvirt-1-storage" node
still has RW permissions set.

I'm wondering it this a bug or this is intended?  Looks like a bug to
me, although I see that some iotests (e.g. 273) expect 2 nodes related
to the same image file to have different RO flags.

bdrv_reopen_set_read_only()
  bdrv_reopen()
bdrv_reopen_queue()
  bdrv_reopen_queue_child()
bdrv_reopen_multiple()
  bdrv_list_refresh_perms()
bdrv_topological_dfs()
bdrv_do_refresh_perms()
  bdrv_reopen_commit()

In the stack above bdrv_reopen_set_read_only() is only being called for
the parent (libvirt-1-format) node.  There're 2 lists: BDSs from
refresh_list are used by bdrv_drv_set_perm and this leads to actual
reopen with RO of the file descriptor.  And then there's reopen queue
bs_queue -- BDSs from this queue get their parameters updated.  While
refresh_list ends up having the whole subtree (including children, this
is done in bdrv_topological_dfs()) bs_queue only has the parent.  And
that is because storage (child) node's (bs->inherits_from == NULL), so
bdrv_reopen_queue_child() never adds it to the queue.  Could it be the
source of this bug?

Anyway, would greatly appreciate a clarification.

Andrey



Re: [PATCH 4/7] qcow2: make subclusters discardable

2024-04-16 Thread Andrey Drobyshev
On 10/27/23 14:10, Jean-Louis Dupond wrote:
> [...]
> 
> I've checked all the code paths, and as far as I see it nowhere breaks
> the discard_no_unref option.
> It's important that we don't introduce new code paths that can make
> holes in the qcow2 image when this option is enabled :)
> 
> If you can confirm my conclusion, that would be great.
> 
> 
> Thanks
> Jean-Louis
> 

Hi Jean-Louis,

I've finally got to working on v2 for this series.  However I'm failing
to get a grasp on what this option is supposed to be doing and what are
we trying to avoid here.

Consider this simple example:

# cd build
# ./qemu-img create -f qcow2   unref.qcow2 192K
# ./qemu-img create -f qcow2 nounref.qcow2 192K
# ./qemu-io -c "write 0 192K"   unref.qcow2
# ./qemu-io -c "write 0 192K" nounref.qcow2
#
# strace -fv -e fallocate ./qemu-io -c "discard 64K 64K" unref.qcow2
[pid 887710] fallocate(9, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE,
393216, 65536) = 0
discard 65536/65536 bytes at offset 65536
64 KiB, 1 ops; 00.00 sec (252.123 MiB/sec and 4033.9660 ops/sec)
#
# strace -fv -e fallocate ./qemu-io -c "reopen -o discard-no-unref=on"
-c "discard 64K 64K" nounref.qcow2
# [pid 887789] fallocate(9, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE,
393216, 65536) = 0
discard 65536/65536 bytes at offset 65536
64 KiB, 1 ops; 00.00 sec (345.457 MiB/sec and 5527.3049 ops/sec)
#
# ./qemu-img check unref.qcow2

No errors were found on the image.
2/3 = 66.67% allocated, 50.00% fragmented, 0.00% compressed clusters
Image end offset: 524288
# ./qemu-img check nounref.qcow2
No errors were found on the image.
3/3 = 100.00% allocated, 0.00% fragmented, 0.00% compressed clusters
Image end offset: 524288
#
# ls -la *.qcow2

-rw-r--r-- 1 root root 524288 Apr 16 22:42 nounref.qcow2
-rw-r--r-- 1 root root 524288 Apr 16 22:41 unref.qcow2
# du --block-size=1 *.qcow2
397312  nounref.qcow2
397312  unref.qcow2

I understand that by keeping the L2 entry we achieve that cluster
remains formally allocated, but no matter whether "discard-no-unref"
option is enabled fallocate(FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE) is
being called leaving a hole in the file (e.g. file becomes sparse).
However you say in the comment above that we can't allow making new
holes in the file when this option is enabled.  How does that correlate
and what do we achieve?  And which logic do you think we need to follow
when discarding separate subclusters?

Andrey



Re: [PATCH v4 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-privileged' field

2024-04-01 Thread Andrey Drobyshev
On 3/22/24 15:17, Andrey Drobyshev wrote:
> On 3/22/24 12:39, Daniel P. Berrangé wrote:
>> On Wed, Mar 20, 2024 at 06:16:42PM +0200, Andrey Drobyshev wrote:
>>> Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to
>>> GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo:
>>> used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as
>>> returned by statvfs(3).  While on Windows guests that's all we can get
>>> with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in
>>> total file system size, as it's visible for root user.  Let's add an
>>> optional field 'total-bytes-privileged' to GuestFilesystemInfo struct,
>>> which'd only be reported on POSIX and represent f_blocks value as returned
>>> by statvfs(3).
>>>
>>> While here, also tweak the docs to reflect better where those values
>>> come from.
>>>
>>> Signed-off-by: Andrey Drobyshev 
>>> ---
>>>  qga/commands-posix.c | 2 ++
>>>  qga/commands-win32.c | 1 +
>>>  qga/qapi-schema.json | 7 +--
>>>  3 files changed, 8 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/qga/commands-posix.c b/qga/commands-posix.c
>>> index 26008db497..7df2d72e9f 100644
>>> --- a/qga/commands-posix.c
>>> +++ b/qga/commands-posix.c
>>> @@ -1569,8 +1569,10 @@ static GuestFilesystemInfo 
>>> *build_guest_fsinfo(struct FsMount *mount,
>>>  nonroot_total = used + buf.f_bavail;
>>>  fs->used_bytes = used * fr_size;
>>>  fs->total_bytes = nonroot_total * fr_size;
>>> +fs->total_bytes_privileged = buf.f_blocks * fr_size;
>>>  
>>>  fs->has_total_bytes = true;
>>> +fs->has_total_bytes_privileged = true;
>>>  fs->has_used_bytes = true;
>>>  }
>>>  
>>> diff --git a/qga/commands-win32.c b/qga/commands-win32.c
>>> index 6242737b00..6fee0e1e6f 100644
>>> --- a/qga/commands-win32.c
>>> +++ b/qga/commands-win32.c
>>> @@ -1143,6 +1143,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(char 
>>> *guid, Error **errp)
>>>  fs = g_malloc(sizeof(*fs));
>>>  fs->name = g_strdup(guid);
>>>  fs->has_total_bytes = false;
>>> +fs->has_total_bytes_privileged = false;
>>>  fs->has_used_bytes = false;
>>>  if (len == 0) {
>>>  fs->mountpoint = g_strdup("System Reserved");
>>> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
>>> index 9554b566a7..dcc469b268 100644
>>> --- a/qga/qapi-schema.json
>>> +++ b/qga/qapi-schema.json
>>> @@ -1026,7 +1026,10 @@
>>>  #
>>>  # @used-bytes: file system used bytes (since 3.0)
>>>  #
>>> -# @total-bytes: non-root file system total bytes (since 3.0)
>>> +# @total-bytes: filesystem capacity in bytes for unprivileged users (since 
>>> 3.0)
>>> +#
>>> +# @total-bytes-privileged: filesystem capacity in bytes for privileged 
>>> users
>>> +# (since 9.0)
>>
>> Will need to be '9.1', not '9.0', since we don't accept new features
>> during freeze periods.
>>
>>>  #
>>>  # @disk: an array of disk hardware information that the volume lies
>>>  # on, which may be empty if the disk type is not supported
>>> @@ -1036,7 +1039,7 @@
>>>  { 'struct': 'GuestFilesystemInfo',
>>>'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str',
>>> '*used-bytes': 'uint64', '*total-bytes': 'uint64',
>>> -   'disk': ['GuestDiskAddress']} }
>>> +   '*total-bytes-privileged': 'uint64', 'disk': 
>>> ['GuestDiskAddress']} }
>>>  
>>>  ##
>>>  # @guest-get-fsinfo:
>>
>> Assuming the version is changed:
>>
>>   Reviewed-by: Daniel P. Berrangé 
>>
>> With regards,
>> Daniel
> 
> Thanks, Daniel.
> 
> @kkostiuk do I need to send one more iteration with the version change?
> Whatever is most convenient for you.
> 
> Andrey

Ping



Re: [PATCH v4 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-privileged' field

2024-03-22 Thread Andrey Drobyshev
On 3/22/24 12:39, Daniel P. Berrangé wrote:
> On Wed, Mar 20, 2024 at 06:16:42PM +0200, Andrey Drobyshev wrote:
>> Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to
>> GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo:
>> used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as
>> returned by statvfs(3).  While on Windows guests that's all we can get
>> with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in
>> total file system size, as it's visible for root user.  Let's add an
>> optional field 'total-bytes-privileged' to GuestFilesystemInfo struct,
>> which'd only be reported on POSIX and represent f_blocks value as returned
>> by statvfs(3).
>>
>> While here, also tweak the docs to reflect better where those values
>> come from.
>>
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>  qga/commands-posix.c | 2 ++
>>  qga/commands-win32.c | 1 +
>>  qga/qapi-schema.json | 7 +--
>>  3 files changed, 8 insertions(+), 2 deletions(-)
>>
>> diff --git a/qga/commands-posix.c b/qga/commands-posix.c
>> index 26008db497..7df2d72e9f 100644
>> --- a/qga/commands-posix.c
>> +++ b/qga/commands-posix.c
>> @@ -1569,8 +1569,10 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct 
>> FsMount *mount,
>>  nonroot_total = used + buf.f_bavail;
>>  fs->used_bytes = used * fr_size;
>>  fs->total_bytes = nonroot_total * fr_size;
>> +fs->total_bytes_privileged = buf.f_blocks * fr_size;
>>  
>>  fs->has_total_bytes = true;
>> +fs->has_total_bytes_privileged = true;
>>  fs->has_used_bytes = true;
>>  }
>>  
>> diff --git a/qga/commands-win32.c b/qga/commands-win32.c
>> index 6242737b00..6fee0e1e6f 100644
>> --- a/qga/commands-win32.c
>> +++ b/qga/commands-win32.c
>> @@ -1143,6 +1143,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(char 
>> *guid, Error **errp)
>>  fs = g_malloc(sizeof(*fs));
>>  fs->name = g_strdup(guid);
>>  fs->has_total_bytes = false;
>> +fs->has_total_bytes_privileged = false;
>>  fs->has_used_bytes = false;
>>  if (len == 0) {
>>  fs->mountpoint = g_strdup("System Reserved");
>> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
>> index 9554b566a7..dcc469b268 100644
>> --- a/qga/qapi-schema.json
>> +++ b/qga/qapi-schema.json
>> @@ -1026,7 +1026,10 @@
>>  #
>>  # @used-bytes: file system used bytes (since 3.0)
>>  #
>> -# @total-bytes: non-root file system total bytes (since 3.0)
>> +# @total-bytes: filesystem capacity in bytes for unprivileged users (since 
>> 3.0)
>> +#
>> +# @total-bytes-privileged: filesystem capacity in bytes for privileged users
>> +# (since 9.0)
> 
> Will need to be '9.1', not '9.0', since we don't accept new features
> during freeze periods.
> 
>>  #
>>  # @disk: an array of disk hardware information that the volume lies
>>  # on, which may be empty if the disk type is not supported
>> @@ -1036,7 +1039,7 @@
>>  { 'struct': 'GuestFilesystemInfo',
>>'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str',
>> '*used-bytes': 'uint64', '*total-bytes': 'uint64',
>> -   'disk': ['GuestDiskAddress']} }
>> +   '*total-bytes-privileged': 'uint64', 'disk': 
>> ['GuestDiskAddress']} }
>>  
>>  ##
>>  # @guest-get-fsinfo:
> 
> Assuming the version is changed:
> 
>   Reviewed-by: Daniel P. Berrangé 
> 
> With regards,
> Daniel

Thanks, Daniel.

@kkostiuk do I need to send one more iteration with the version change?
Whatever is most convenient for you.

Andrey



[PATCH v4 3/7] qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper

2024-03-20 Thread Andrey Drobyshev
Also remove the G_GNUC_UNUSED attribute added in the previous commit from
the helper.

Signed-off-by: Andrey Drobyshev 
Reviewed-by: Daniel P. Berrangé 
---
 qga/commands-posix.c | 39 ++-
 1 file changed, 6 insertions(+), 33 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 9b1bdf194c..cb9eed9a0b 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -108,7 +108,6 @@ static ssize_t ga_pipe_read_str(int fd[2], char **str)
  * sending string to stdin and taking error message from
  * stdout/err.
  */
-G_GNUC_UNUSED
 static int ga_run_command(const char *argv[], const char *in_str,
   const char *action, Error **errp)
 {
@@ -230,8 +229,6 @@ void qmp_guest_shutdown(const char *mode, Error **errp)
 {
 const char *shutdown_flag;
 Error *local_err = NULL;
-pid_t pid;
-int status;
 
 #ifdef CONFIG_SOLARIS
 const char *powerdown_flag = "-i5";
@@ -260,46 +257,22 @@ void qmp_guest_shutdown(const char *mode, Error **errp)
 return;
 }
 
-pid = fork();
-if (pid == 0) {
-/* child, start the shutdown */
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
+const char *argv[] = {"/sbin/shutdown",
 #ifdef CONFIG_SOLARIS
-execl("/sbin/shutdown", "shutdown", shutdown_flag, "-g0", "-y",
-  "hypervisor initiated shutdown", (char *)NULL);
+  shutdown_flag, "-g0", "-y",
 #elif defined(CONFIG_BSD)
-execl("/sbin/shutdown", "shutdown", shutdown_flag, "+0",
-   "hypervisor initiated shutdown", (char *)NULL);
+  shutdown_flag, "+0",
 #else
-execl("/sbin/shutdown", "shutdown", "-h", shutdown_flag, "+0",
-   "hypervisor initiated shutdown", (char *)NULL);
+  "-h", shutdown_flag, "+0",
 #endif
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
+  "hypervisor initiated shutdown", (char *) NULL};
 
-ga_wait_child(pid, , _err);
+ga_run_command(argv, NULL, "shutdown", _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
 
-if (!WIFEXITED(status)) {
-error_setg(errp, "child process has terminated abnormally");
-return;
-}
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "child process has failed to shutdown");
-return;
-}
-
 /* succeeded */
 }
 
-- 
2.39.3




[PATCH v4 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-privileged' field

2024-03-20 Thread Andrey Drobyshev
Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to
GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo:
used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as
returned by statvfs(3).  While on Windows guests that's all we can get
with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in
total file system size, as it's visible for root user.  Let's add an
optional field 'total-bytes-privileged' to GuestFilesystemInfo struct,
which'd only be reported on POSIX and represent f_blocks value as returned
by statvfs(3).

While here, also tweak the docs to reflect better where those values
come from.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 2 ++
 qga/commands-win32.c | 1 +
 qga/qapi-schema.json | 7 +--
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 26008db497..7df2d72e9f 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -1569,8 +1569,10 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct 
FsMount *mount,
 nonroot_total = used + buf.f_bavail;
 fs->used_bytes = used * fr_size;
 fs->total_bytes = nonroot_total * fr_size;
+fs->total_bytes_privileged = buf.f_blocks * fr_size;
 
 fs->has_total_bytes = true;
+fs->has_total_bytes_privileged = true;
 fs->has_used_bytes = true;
 }
 
diff --git a/qga/commands-win32.c b/qga/commands-win32.c
index 6242737b00..6fee0e1e6f 100644
--- a/qga/commands-win32.c
+++ b/qga/commands-win32.c
@@ -1143,6 +1143,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(char 
*guid, Error **errp)
 fs = g_malloc(sizeof(*fs));
 fs->name = g_strdup(guid);
 fs->has_total_bytes = false;
+fs->has_total_bytes_privileged = false;
 fs->has_used_bytes = false;
 if (len == 0) {
 fs->mountpoint = g_strdup("System Reserved");
diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
index 9554b566a7..dcc469b268 100644
--- a/qga/qapi-schema.json
+++ b/qga/qapi-schema.json
@@ -1026,7 +1026,10 @@
 #
 # @used-bytes: file system used bytes (since 3.0)
 #
-# @total-bytes: non-root file system total bytes (since 3.0)
+# @total-bytes: filesystem capacity in bytes for unprivileged users (since 3.0)
+#
+# @total-bytes-privileged: filesystem capacity in bytes for privileged users
+# (since 9.0)
 #
 # @disk: an array of disk hardware information that the volume lies
 # on, which may be empty if the disk type is not supported
@@ -1036,7 +1039,7 @@
 { 'struct': 'GuestFilesystemInfo',
   'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str',
'*used-bytes': 'uint64', '*total-bytes': 'uint64',
-   'disk': ['GuestDiskAddress']} }
+   '*total-bytes-privileged': 'uint64', 'disk': ['GuestDiskAddress']} }
 
 ##
 # @guest-get-fsinfo:
-- 
2.39.3




[PATCH v4 5/7] qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper

2024-03-20 Thread Andrey Drobyshev
There's no need to check for the existence of the hook executable, as the
exec() call will do that for us.

Signed-off-by: Andrey Drobyshev 
Reviewed-by: Daniel P. Berrangé 
---
 qga/commands-posix.c | 35 +++
 1 file changed, 3 insertions(+), 32 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 545f3c99dc..9b993772f5 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -736,8 +736,6 @@ static const char *fsfreeze_hook_arg_string[] = {
 
 static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp)
 {
-int status;
-pid_t pid;
 const char *hook;
 const char *arg_str = fsfreeze_hook_arg_string[arg];
 Error *local_err = NULL;
@@ -746,42 +744,15 @@ static void execute_fsfreeze_hook(FsfreezeHookArg arg, 
Error **errp)
 if (!hook) {
 return;
 }
-if (access(hook, X_OK) != 0) {
-error_setg_errno(errp, errno, "can't access fsfreeze hook '%s'", hook);
-return;
-}
 
-slog("executing fsfreeze hook with arg '%s'", arg_str);
-pid = fork();
-if (pid == 0) {
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-execl(hook, hook, arg_str, NULL);
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
+const char *argv[] = {hook, arg_str, NULL};
 
-ga_wait_child(pid, , _err);
+slog("executing fsfreeze hook with arg '%s'", arg_str);
+ga_run_command(argv, NULL, "execute fsfreeze hook", _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
-
-if (!WIFEXITED(status)) {
-error_setg(errp, "fsfreeze hook has terminated abnormally");
-return;
-}
-
-status = WEXITSTATUS(status);
-if (status) {
-error_setg(errp, "fsfreeze hook has failed with status %d", status);
-return;
-}
 }
 
 /*
-- 
2.39.3




[PATCH v4 0/7] qga/commands-posix: replace code duplicating commands with a helper

2024-03-20 Thread Andrey Drobyshev
v3 -> v4:
  * Patch 1/7:
- Replaced "since 8.3" with "since 9.0" as we're now at v9.0.0-rc0;
- Renamed the field to 'total-bytes-privileged';
- Got rid of the implementation details in the docs;
  * Patch 6/7: added g_autoptr macro to local error declaration.

v3: https://lists.nongnu.org/archive/html/qemu-devel/2024-03/msg04068.html

Andrey Drobyshev (7):
  qga: guest-get-fsinfo: add optional 'total-bytes-privileged' field
  qga: introduce ga_run_command() helper for guest cmd execution
  qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper
  qga/commands-posix: qmp_guest_set_time: use ga_run_command helper
  qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper
  qga/commands-posix: don't do fork()/exec() when suspending via sysfs
  qga/commands-posix: qmp_guest_set_user_password: use ga_run_command
helper

 qga/commands-posix.c | 404 +++
 qga/commands-win32.c |   1 +
 qga/qapi-schema.json |   7 +-
 3 files changed, 187 insertions(+), 225 deletions(-)

-- 
2.39.3




[PATCH v4 6/7] qga/commands-posix: don't do fork()/exec() when suspending via sysfs

2024-03-20 Thread Andrey Drobyshev
Since commit 246d76eba ("qga: guest_suspend: decoupling pm-utils and sys
logic") pm-utils logic is running in a separate child from the sysfs
logic.  Now when suspending via sysfs we don't really need to do that in
a separate process as we only need to perform one write to /sys/power/state.

Let's just use g_file_set_contents() to simplify things here.

Suggested-by: Daniel P. Berrangé 
Signed-off-by: Andrey Drobyshev 
Reviewed-by: Daniel P. Berrangé 
---
 qga/commands-posix.c | 41 +
 1 file changed, 5 insertions(+), 36 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 9b993772f5..9910957ff5 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -1928,52 +1928,21 @@ static bool linux_sys_state_supports_mode(SuspendMode 
mode, Error **errp)
 
 static void linux_sys_state_suspend(SuspendMode mode, Error **errp)
 {
-Error *local_err = NULL;
+g_autoptr(GError) local_gerr = NULL;
 const char *sysfile_strs[3] = {"disk", "mem", NULL};
 const char *sysfile_str = sysfile_strs[mode];
-pid_t pid;
-int status;
 
 if (!sysfile_str) {
 error_setg(errp, "unknown guest suspend mode");
 return;
 }
 
-pid = fork();
-if (!pid) {
-/* child */
-int fd;
-
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-fd = open(LINUX_SYS_STATE_FILE, O_WRONLY);
-if (fd < 0) {
-_exit(EXIT_FAILURE);
-}
-
-if (write(fd, sysfile_str, strlen(sysfile_str)) < 0) {
-_exit(EXIT_FAILURE);
-}
-
-_exit(EXIT_SUCCESS);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
-
-ga_wait_child(pid, , _err);
-if (local_err) {
-error_propagate(errp, local_err);
+if (!g_file_set_contents(LINUX_SYS_STATE_FILE, sysfile_str,
+ -1, _gerr)) {
+error_setg(errp, "suspend: cannot write to '%s': %s",
+   LINUX_SYS_STATE_FILE, local_gerr->message);
 return;
 }
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "child process has failed to suspend");
-}
-
 }
 
 static void guest_suspend(SuspendMode mode, Error **errp)
-- 
2.39.3




[PATCH v4 4/7] qga/commands-posix: qmp_guest_set_time: use ga_run_command helper

2024-03-20 Thread Andrey Drobyshev
There's no need to check for the existence of "/sbin/hwclock", the
exec() call will do that for us.

Signed-off-by: Andrey Drobyshev 
Reviewed-by: Daniel P. Berrangé 
---
 qga/commands-posix.c | 43 +++
 1 file changed, 3 insertions(+), 40 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index cb9eed9a0b..545f3c99dc 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -279,21 +279,9 @@ void qmp_guest_shutdown(const char *mode, Error **errp)
 void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp)
 {
 int ret;
-int status;
-pid_t pid;
 Error *local_err = NULL;
 struct timeval tv;
-static const char hwclock_path[] = "/sbin/hwclock";
-static int hwclock_available = -1;
-
-if (hwclock_available < 0) {
-hwclock_available = (access(hwclock_path, X_OK) == 0);
-}
-
-if (!hwclock_available) {
-error_setg(errp, QERR_UNSUPPORTED);
-return;
-}
+const char *argv[] = {"/sbin/hwclock", has_time ? "-w" : "-s", NULL};
 
 /* If user has passed a time, validate and set it. */
 if (has_time) {
@@ -324,37 +312,12 @@ void qmp_guest_set_time(bool has_time, int64_t time_ns, 
Error **errp)
  * just need to synchronize the hardware clock. However, if no time was
  * passed, user is requesting the opposite: set the system time from the
  * hardware clock (RTC). */
-pid = fork();
-if (pid == 0) {
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-/* Use '/sbin/hwclock -w' to set RTC from the system time,
- * or '/sbin/hwclock -s' to set the system time from RTC. */
-execl(hwclock_path, "hwclock", has_time ? "-w" : "-s", NULL);
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
-
-ga_wait_child(pid, , _err);
+ga_run_command(argv, NULL, "set hardware clock to system time",
+   _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
-
-if (!WIFEXITED(status)) {
-error_setg(errp, "child process has terminated abnormally");
-return;
-}
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "hwclock failed to set hardware clock to system 
time");
-return;
-}
 }
 
 typedef enum {
-- 
2.39.3




[PATCH v4 7/7] qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper

2024-03-20 Thread Andrey Drobyshev
There's no need to check for the existence of the "chpasswd", "pw"
executables, as the exec() call will do that for us.

Signed-off-by: Andrey Drobyshev 
Reviewed-by: Daniel P. Berrangé 
---
 qga/commands-posix.c | 96 ++--
 1 file changed, 13 insertions(+), 83 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 9910957ff5..7a065c4085 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -2151,14 +2151,8 @@ void qmp_guest_set_user_password(const char *username,
  Error **errp)
 {
 Error *local_err = NULL;
-char *passwd_path = NULL;
-pid_t pid;
-int status;
-int datafd[2] = { -1, -1 };
-char *rawpasswddata = NULL;
+g_autofree char *rawpasswddata = NULL;
 size_t rawpasswdlen;
-char *chpasswddata = NULL;
-size_t chpasswdlen;
 
 rawpasswddata = (char *)qbase64_decode(password, -1, , errp);
 if (!rawpasswddata) {
@@ -2169,95 +2163,31 @@ void qmp_guest_set_user_password(const char *username,
 
 if (strchr(rawpasswddata, '\n')) {
 error_setg(errp, "forbidden characters in raw password");
-goto out;
+return;
 }
 
 if (strchr(username, '\n') ||
 strchr(username, ':')) {
 error_setg(errp, "forbidden characters in username");
-goto out;
+return;
 }
 
 #ifdef __FreeBSD__
-chpasswddata = g_strdup(rawpasswddata);
-passwd_path = g_find_program_in_path("pw");
+g_autofree char *chpasswdata = g_strdup(rawpasswddata);
+const char *crypt_flag = crypted ? "-H" : "-h";
+const char *argv[] = {"pw", "usermod", "-n", username,
+  crypt_flag, "0", NULL};
 #else
-chpasswddata = g_strdup_printf("%s:%s\n", username, rawpasswddata);
-passwd_path = g_find_program_in_path("chpasswd");
+g_autofree char *chpasswddata = g_strdup_printf("%s:%s\n", username,
+rawpasswddata);
+const char *crypt_flag = crypted ? "-e" : NULL;
+const char *argv[] = {"chpasswd", crypt_flag, NULL};
 #endif
 
-chpasswdlen = strlen(chpasswddata);
-
-if (!passwd_path) {
-error_setg(errp, "cannot find 'passwd' program in PATH");
-goto out;
-}
-
-if (!g_unix_open_pipe(datafd, FD_CLOEXEC, NULL)) {
-error_setg(errp, "cannot create pipe FDs");
-goto out;
-}
-
-pid = fork();
-if (pid == 0) {
-close(datafd[1]);
-/* child */
-setsid();
-dup2(datafd[0], 0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-#ifdef __FreeBSD__
-const char *h_arg;
-h_arg = (crypted) ? "-H" : "-h";
-execl(passwd_path, "pw", "usermod", "-n", username, h_arg, "0", NULL);
-#else
-if (crypted) {
-execl(passwd_path, "chpasswd", "-e", NULL);
-} else {
-execl(passwd_path, "chpasswd", NULL);
-}
-#endif
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-goto out;
-}
-close(datafd[0]);
-datafd[0] = -1;
-
-if (qemu_write_full(datafd[1], chpasswddata, chpasswdlen) != chpasswdlen) {
-error_setg_errno(errp, errno, "cannot write new account password");
-goto out;
-}
-close(datafd[1]);
-datafd[1] = -1;
-
-ga_wait_child(pid, , _err);
+ga_run_command(argv, chpasswddata, "set user password", _err);
 if (local_err) {
 error_propagate(errp, local_err);
-goto out;
-}
-
-if (!WIFEXITED(status)) {
-error_setg(errp, "child process has terminated abnormally");
-goto out;
-}
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "child process has failed to set user password");
-goto out;
-}
-
-out:
-g_free(chpasswddata);
-g_free(rawpasswddata);
-g_free(passwd_path);
-if (datafd[0] != -1) {
-close(datafd[0]);
-}
-if (datafd[1] != -1) {
-close(datafd[1]);
+return;
 }
 }
 #else /* __linux__ || __FreeBSD__ */
-- 
2.39.3




[PATCH v4 2/7] qga: introduce ga_run_command() helper for guest cmd execution

2024-03-20 Thread Andrey Drobyshev
When executing guest commands in *nix environment, we repeat the same
fork/exec pattern multiple times.  Let's just separate it into a single
helper which would also be able to feed input data into the launched
process' stdin.  This way we can avoid code duplication.

To keep the history more bisectable, let's replace qmp commands
implementations one by one.  Also add G_GNUC_UNUSED attribute to the
helper and remove it in the next commit.

Originally-by: Yuri Pudgorodskiy 
Signed-off-by: Andrey Drobyshev 
Reviewed-by: Daniel P. Berrangé 
---
 qga/commands-posix.c | 150 +++
 1 file changed, 150 insertions(+)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 7df2d72e9f..9b1bdf194c 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -76,6 +76,156 @@ static void ga_wait_child(pid_t pid, int *status, Error 
**errp)
 g_assert(rpid == pid);
 }
 
+static ssize_t ga_pipe_read_str(int fd[2], char **str)
+{
+ssize_t n, len = 0;
+char buf[1024];
+
+close(fd[1]);
+fd[1] = -1;
+while ((n = read(fd[0], buf, sizeof(buf))) != 0) {
+if (n < 0) {
+if (errno == EINTR) {
+continue;
+} else {
+len = -errno;
+break;
+}
+}
+*str = g_realloc(*str, len + n + 1);
+memcpy(*str + len, buf, n);
+len += n;
+*str[len] = '\0';
+}
+close(fd[0]);
+fd[0] = -1;
+
+return len;
+}
+
+/*
+ * Helper to run command with input/output redirection,
+ * sending string to stdin and taking error message from
+ * stdout/err.
+ */
+G_GNUC_UNUSED
+static int ga_run_command(const char *argv[], const char *in_str,
+  const char *action, Error **errp)
+{
+pid_t pid;
+int status;
+int retcode = -1;
+int infd[2] = { -1, -1 };
+int outfd[2] = { -1, -1 };
+char *str = NULL;
+ssize_t len = 0;
+
+if ((in_str && !g_unix_open_pipe(infd, FD_CLOEXEC, NULL)) ||
+!g_unix_open_pipe(outfd, FD_CLOEXEC, NULL)) {
+error_setg(errp, "cannot create pipe FDs");
+goto out;
+}
+
+pid = fork();
+if (pid == 0) {
+char *cherr = NULL;
+
+setsid();
+
+if (in_str) {
+/* Redirect stdin to infd. */
+close(infd[1]);
+dup2(infd[0], 0);
+close(infd[0]);
+} else {
+reopen_fd_to_null(0);
+}
+
+/* Redirect stdout/stderr to outfd. */
+close(outfd[0]);
+dup2(outfd[1], 1);
+dup2(outfd[1], 2);
+close(outfd[1]);
+
+execvp(argv[0], (char *const *)argv);
+
+/* Write the cause of failed exec to pipe for the parent to read it. */
+cherr = g_strdup_printf("failed to exec '%s'", argv[0]);
+perror(cherr);
+g_free(cherr);
+_exit(EXIT_FAILURE);
+} else if (pid < 0) {
+error_setg_errno(errp, errno, "failed to create child process");
+goto out;
+}
+
+if (in_str) {
+close(infd[0]);
+infd[0] = -1;
+if (qemu_write_full(infd[1], in_str, strlen(in_str)) !=
+strlen(in_str)) {
+error_setg_errno(errp, errno, "%s: cannot write to stdin pipe",
+ action);
+goto out;
+}
+close(infd[1]);
+infd[1] = -1;
+}
+
+len = ga_pipe_read_str(outfd, );
+if (len < 0) {
+error_setg_errno(errp, -len, "%s: cannot read from stdout/stderr pipe",
+ action);
+goto out;
+}
+
+ga_wait_child(pid, , errp);
+if (*errp) {
+goto out;
+}
+
+if (!WIFEXITED(status)) {
+if (len) {
+error_setg(errp, "child process has terminated abnormally: %s",
+   str);
+} else {
+error_setg(errp, "child process has terminated abnormally");
+}
+goto out;
+}
+
+retcode = WEXITSTATUS(status);
+
+if (WEXITSTATUS(status)) {
+if (len) {
+error_setg(errp, "child process has failed to %s: %s",
+   action, str);
+} else {
+error_setg(errp, "child process has failed to %s: exit status %d",
+   action, WEXITSTATUS(status));
+}
+goto out;
+}
+
+out:
+g_free(str);
+
+if (infd[0] != -1) {
+close(infd[0]);
+}
+if (infd[1] != -1) {
+close(infd[1]);
+}
+if (outfd[0] != -1) {
+close(outfd[0]);
+}
+if (outfd[1] != -1) {
+close(outfd[1]);
+}
+
+return retcode;
+}
+
 void qmp_guest_shutdown(const char *mode, Error **errp)
 {
 const char *shutdown_flag;
-- 
2.39.3




Re: [PATCH v3 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-root' field

2024-03-20 Thread Andrey Drobyshev
On 3/19/24 19:37, Daniel P. Berrangé wrote:
> On Fri, Mar 15, 2024 at 02:29:40PM +0200, Andrey Drobyshev wrote:
>> Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to
>> GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo:
>> used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as
>> returned by statvfs(3).  While on Windows guests that's all we can get
>> with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in
>> total file system size, as it's visible for root user.  Let's add an
>> optional field 'total-bytes-root' to GuestFilesystemInfo struct, which'd
>> only be reported on POSIX and represent f_blocks value as returned by
>> statvfs(3).
>>
>> While here, let's document better where those values come from in both
>> POSIX and Windows.
>>
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>  qga/commands-posix.c |  2 ++
>>  qga/commands-win32.c |  1 +
>>  qga/qapi-schema.json | 12 +++-
>>  3 files changed, 14 insertions(+), 1 deletion(-)
>>
>> diff --git a/qga/commands-posix.c b/qga/commands-posix.c
>> index 26008db497..8207c4c47e 100644
>> --- a/qga/commands-posix.c
>> +++ b/qga/commands-posix.c
>> @@ -1569,8 +1569,10 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct 
>> FsMount *mount,
>>  nonroot_total = used + buf.f_bavail;
>>  fs->used_bytes = used * fr_size;
>>  fs->total_bytes = nonroot_total * fr_size;
>> +fs->total_bytes_root = buf.f_blocks * fr_size;
>>  
>>  fs->has_total_bytes = true;
>> +fs->has_total_bytes_root = true;
>>  fs->has_used_bytes = true;
>>  }
>>  
>> diff --git a/qga/commands-win32.c b/qga/commands-win32.c
>> index a1015757d8..9e820aad8d 100644
>> --- a/qga/commands-win32.c
>> +++ b/qga/commands-win32.c
>> @@ -1143,6 +1143,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(char 
>> *guid, Error **errp)
>>  fs = g_malloc(sizeof(*fs));
>>  fs->name = g_strdup(guid);
>>  fs->has_total_bytes = false;
>> +fs->has_total_bytes_root = false;
> 
> Can we use GetDiskSpaceInformationA to return this information
> on Windows ? In contrast to GetDiskFreeSpaceExA(), the
> DISK_SPACE_INFORMATION struct details both the real sizes
> and the current user available sizes.
> 

It seems that this API has only been included in mingw64 recently:
https://github.com/mingw-w64/mingw-w64/commit/66546556

Apparently since it happened there hasn't been a new release of mingw64,
so the latest version v11.0.1 still doesn't have it.  So I guess we have
no choice but to leave Win fields as-is for now and switch to the new
API later.

>>  fs->has_used_bytes = false;
>>  if (len == 0) {
>>  fs->mountpoint = g_strdup("System Reserved");
>> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
>> index b8efe31897..093a5ab602 100644
>> --- a/qga/qapi-schema.json
>> +++ b/qga/qapi-schema.json
>> @@ -1031,8 +1031,18 @@
>>  # @type: file system type string
>>  #
>>  # @used-bytes: file system used bytes (since 3.0)
>> +# * POSIX: (f_blocks - f_bfree) * f_frsize, as returned by statvfs(3)
>> +# * Windows: (TotalNumberOfBytes - TotalNumberOfFreeBytes), as returned
>> +#   by GetDiskFreeSpaceEx()
>>  #
>>  # @total-bytes: non-root file system total bytes (since 3.0)
>> +# * POSIX: (f_blocks - f_bfree + f_bavail) * f_frsize, as returned by
>> +#   statvfs(3)
>> +# * Windows: TotalNumberOfBytes, as returned by GetDiskFreeSpaceEx()
>> +#
>> +# @total-bytes-root: total file system size in bytes (as visible for a
>> +# priviledged user) (since 8.3)
>> +# * POSIX only: (f_blocks * f_frsize), returned by statvfs(3)
> 
> I tend to wonder whether it is really a good idea to document
> our specific implementation details in the public API
> 
> I might suggest
> 
>   @total-bytes: filesystem capacity in bytes for unprivileged users
>   @total-bytes-root: filesystem capacity in bytes for privileged users
>

My initial intent was to get rid of the necessity to dig into the
sources in order to understand what those values mean.  But since we're
discussing changes in the implementation, I guess it is wise not to
mention those details indeed.

> also should we call it 'total-bytes-privilegd', to avoid UNIX specific
> terminology.
> 

Agreed.

>>  #
>>  # @disk: an array of disk hardware information that the volume lies
>>  # on, which may be empty if the disk type is not supported
>> @@ -1042,7 +1052,7 @@
>>  { 'struct': 'GuestFilesystemInfo',
>>'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str',
>> '*used-bytes': 'uint64', '*total-bytes': 'uint64',
>> -   'disk': ['GuestDiskAddress']} }
>> +   '*total-bytes-root': 'uint64', 'disk': ['GuestDiskAddress']} }
>>  
>>  ##
>>  # @guest-get-fsinfo:
>> -- 
>> 2.39.3
>>
> 
> With regards,
> Daniel




Re: [PATCH v3 6/7] qga/commands-posix: don't do fork()/exec() when suspending via sysfs

2024-03-20 Thread Andrey Drobyshev
On 3/19/24 20:02, Daniel P. Berrangé wrote:
> On Fri, Mar 15, 2024 at 02:29:45PM +0200, Andrey Drobyshev wrote:
>> Since commit 246d76eba ("qga: guest_suspend: decoupling pm-utils and sys
>> logic") pm-utils logic is running in a separate child from the sysfs
>> logic.  Now when suspending via sysfs we don't really need to do that in
>> a separate process as we only need to perform one write to /sys/power/state.
>>
>> Let's just use g_file_set_contents() to simplify things here.
>>
>> Suggested-by: Daniel P. Berrangé 
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>  qga/commands-posix.c | 41 +
>>  1 file changed, 5 insertions(+), 36 deletions(-)
> 
> Reviewed-by: Daniel P. Berrangé 
> 
>>
>> diff --git a/qga/commands-posix.c b/qga/commands-posix.c
>> index 610d225d30..e0ea377f65 100644
>> --- a/qga/commands-posix.c
>> +++ b/qga/commands-posix.c
>> @@ -1928,52 +1928,21 @@ static bool 
>> linux_sys_state_supports_mode(SuspendMode mode, Error **errp)
>>  
>>  static void linux_sys_state_suspend(SuspendMode mode, Error **errp)
>>  {
>> -Error *local_err = NULL;
>> +GError *local_gerr = NULL;
>>  const char *sysfile_strs[3] = {"disk", "mem", NULL};
>>  const char *sysfile_str = sysfile_strs[mode];
>> -pid_t pid;
>> -int status;
>>  
>>  if (!sysfile_str) {
>>  error_setg(errp, "unknown guest suspend mode");
>>  return;
>>  }
>>  
>> -pid = fork();
>> -if (!pid) {
>> -/* child */
>> -int fd;
>> -
>> -setsid();
>> -reopen_fd_to_null(0);
>> -reopen_fd_to_null(1);
>> -reopen_fd_to_null(2);
>> -
>> -fd = open(LINUX_SYS_STATE_FILE, O_WRONLY);
>> -if (fd < 0) {
>> -_exit(EXIT_FAILURE);
>> -}
>> -
>> -if (write(fd, sysfile_str, strlen(sysfile_str)) < 0) {
>> -_exit(EXIT_FAILURE);
>> -}
>> -
>> -_exit(EXIT_SUCCESS);
>> -} else if (pid < 0) {
>> -error_setg_errno(errp, errno, "failed to create child process");
>> -return;
>> -}
>> -
>> -ga_wait_child(pid, , _err);
>> -if (local_err) {
>> -error_propagate(errp, local_err);
>> +if (!g_file_set_contents(LINUX_SYS_STATE_FILE, sysfile_str,
>> + -1, _gerr)) {
>> +error_setg(errp, "suspend: cannot write to '%s': %s",
>> +   LINUX_SYS_STATE_FILE, local_gerr->message);
> 
> You need to declare with "g_autoptr(GError) local_gerr = NULL" to
> avoid a leak here.
> 

Of course, thanks for noticing.

>>  return;
>>  }
>> -
>> -if (WEXITSTATUS(status)) {
>> -error_setg(errp, "child process has failed to suspend");
>> -}
>> -
>>  }
>>  
>>  static void guest_suspend(SuspendMode mode, Error **errp)
>> -- 
>> 2.39.3
>>
> 
> With regards,
> Daniel




Re: [PATCH v3 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-root' field

2024-03-15 Thread Andrey Drobyshev
On 3/15/24 15:44, Markus Armbruster wrote:
> [?? ??? ? ?? ?? arm...@redhat.com. ???, ?? ??? ?, 
> ?? ?? https://aka.ms/LearnAboutSenderIdentification ]
> 
> Andrey Drobyshev  writes:
> 
>> Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to
>> GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo:
>> used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as
>> returned by statvfs(3).  While on Windows guests that's all we can get
>> with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in
>> total file system size, as it's visible for root user.  Let's add an
>> optional field 'total-bytes-root' to GuestFilesystemInfo struct, which'd
>> only be reported on POSIX and represent f_blocks value as returned by
>> statvfs(3).
>>
>> While here, let's document better where those values come from in both
>> POSIX and Windows.
>>
>> Signed-off-by: Andrey Drobyshev 
> 
> [...]
> 
>> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
>> index b8efe31897..093a5ab602 100644
>> --- a/qga/qapi-schema.json
>> +++ b/qga/qapi-schema.json
>> @@ -1031,8 +1031,18 @@
>>  # @type: file system type string
>>  #
>>  # @used-bytes: file system used bytes (since 3.0)
>> +# * POSIX: (f_blocks - f_bfree) * f_frsize, as returned by statvfs(3)
>> +# * Windows: (TotalNumberOfBytes - TotalNumberOfFreeBytes), as returned
>> +#   by GetDiskFreeSpaceEx()
>>  #
>>  # @total-bytes: non-root file system total bytes (since 3.0)
>> +# * POSIX: (f_blocks - f_bfree + f_bavail) * f_frsize, as returned by
>> +#   statvfs(3)
>> +# * Windows: TotalNumberOfBytes, as returned by GetDiskFreeSpaceEx()
>> +#
>> +# @total-bytes-root: total file system size in bytes (as visible for a
>> +# priviledged user) (since 8.3)
> 
> privileged
> 
>> +# * POSIX only: (f_blocks * f_frsize), returned by statvfs(3)
>>  #
>>  # @disk: an array of disk hardware information that the volume lies
>>  # on, which may be empty if the disk type is not supported
>> @@ -1042,7 +1052,7 @@
>>  { 'struct': 'GuestFilesystemInfo',
>>'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str',
>> '*used-bytes': 'uint64', '*total-bytes': 'uint64',
>> -   'disk': ['GuestDiskAddress']} }
>> +   '*total-bytes-root': 'uint64', 'disk': ['GuestDiskAddress']} }
>>
>>  ##
>>  # @guest-get-fsinfo:
> 
> Fails to build the manual:
> 
> qga/qapi-schema.json:1019:Unexpected indentation.
> 
> To fix, add blank lines before the lists, like this:
> 
># @used-bytes: file system used bytes (since 3.0)
>#
># * POSIX: (f_blocks - f_bfree) * f_frsize, as returned by
>#   statvfs(3)
># * Windows: (TotalNumberOfBytes - TotalNumberOfFreeBytes), as
>#   returned by GetDiskFreeSpaceEx()
>#
># @total-bytes: non-root file system total bytes (since 3.0)
>#
># * POSIX: (f_blocks - f_bfree + f_bavail) * f_frsize, as returned by
>#   statvfs(3)
># * Windows: TotalNumberOfBytes, as returned by GetDiskFreeSpaceEx()
>#
># @total-bytes-root: total file system size in bytes (as visible for a
># privileged user) (since 8.3)
>#
># * POSIX only: (f_blocks * f_frsize), returned by statvfs(3)
>#
> 
> Yes, reST can be quite annoying.
> 

For some reason in my environment build doesn't fail and file
build/docs/qemu-ga-ref.7 is produced.  However lists aren't indeed
formatted properly.  I'll wait for other patches to be reviewed and fix
that in v4.  Thank you for noticing.



[PATCH v3 5/7] qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper

2024-03-15 Thread Andrey Drobyshev
There's no need to check for the existence of the hook executable, as the
exec() call will do that for us.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 35 +++
 1 file changed, 3 insertions(+), 32 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 94b652d54e..610d225d30 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -736,8 +736,6 @@ static const char *fsfreeze_hook_arg_string[] = {
 
 static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp)
 {
-int status;
-pid_t pid;
 const char *hook;
 const char *arg_str = fsfreeze_hook_arg_string[arg];
 Error *local_err = NULL;
@@ -746,42 +744,15 @@ static void execute_fsfreeze_hook(FsfreezeHookArg arg, 
Error **errp)
 if (!hook) {
 return;
 }
-if (access(hook, X_OK) != 0) {
-error_setg_errno(errp, errno, "can't access fsfreeze hook '%s'", hook);
-return;
-}
 
-slog("executing fsfreeze hook with arg '%s'", arg_str);
-pid = fork();
-if (pid == 0) {
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-execl(hook, hook, arg_str, NULL);
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
+const char *argv[] = {hook, arg_str, NULL};
 
-ga_wait_child(pid, , _err);
+slog("executing fsfreeze hook with arg '%s'", arg_str);
+ga_run_command(argv, NULL, "execute fsfreeze hook", _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
-
-if (!WIFEXITED(status)) {
-error_setg(errp, "fsfreeze hook has terminated abnormally");
-return;
-}
-
-status = WEXITSTATUS(status);
-if (status) {
-error_setg(errp, "fsfreeze hook has failed with status %d", status);
-return;
-}
 }
 
 /*
-- 
2.39.3




[PATCH v3 7/7] qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper

2024-03-15 Thread Andrey Drobyshev
There's no need to check for the existence of the "chpasswd", "pw"
executables, as the exec() call will do that for us.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 96 ++--
 1 file changed, 13 insertions(+), 83 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index e0ea377f65..ffea69c3f0 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -2151,14 +2151,8 @@ void qmp_guest_set_user_password(const char *username,
  Error **errp)
 {
 Error *local_err = NULL;
-char *passwd_path = NULL;
-pid_t pid;
-int status;
-int datafd[2] = { -1, -1 };
-char *rawpasswddata = NULL;
+g_autofree char *rawpasswddata = NULL;
 size_t rawpasswdlen;
-char *chpasswddata = NULL;
-size_t chpasswdlen;
 
 rawpasswddata = (char *)qbase64_decode(password, -1, , errp);
 if (!rawpasswddata) {
@@ -2169,95 +2163,31 @@ void qmp_guest_set_user_password(const char *username,
 
 if (strchr(rawpasswddata, '\n')) {
 error_setg(errp, "forbidden characters in raw password");
-goto out;
+return;
 }
 
 if (strchr(username, '\n') ||
 strchr(username, ':')) {
 error_setg(errp, "forbidden characters in username");
-goto out;
+return;
 }
 
 #ifdef __FreeBSD__
-chpasswddata = g_strdup(rawpasswddata);
-passwd_path = g_find_program_in_path("pw");
+g_autofree char *chpasswdata = g_strdup(rawpasswddata);
+const char *crypt_flag = crypted ? "-H" : "-h";
+const char *argv[] = {"pw", "usermod", "-n", username,
+  crypt_flag, "0", NULL};
 #else
-chpasswddata = g_strdup_printf("%s:%s\n", username, rawpasswddata);
-passwd_path = g_find_program_in_path("chpasswd");
+g_autofree char *chpasswddata = g_strdup_printf("%s:%s\n", username,
+rawpasswddata);
+const char *crypt_flag = crypted ? "-e" : NULL;
+const char *argv[] = {"chpasswd", crypt_flag, NULL};
 #endif
 
-chpasswdlen = strlen(chpasswddata);
-
-if (!passwd_path) {
-error_setg(errp, "cannot find 'passwd' program in PATH");
-goto out;
-}
-
-if (!g_unix_open_pipe(datafd, FD_CLOEXEC, NULL)) {
-error_setg(errp, "cannot create pipe FDs");
-goto out;
-}
-
-pid = fork();
-if (pid == 0) {
-close(datafd[1]);
-/* child */
-setsid();
-dup2(datafd[0], 0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-#ifdef __FreeBSD__
-const char *h_arg;
-h_arg = (crypted) ? "-H" : "-h";
-execl(passwd_path, "pw", "usermod", "-n", username, h_arg, "0", NULL);
-#else
-if (crypted) {
-execl(passwd_path, "chpasswd", "-e", NULL);
-} else {
-execl(passwd_path, "chpasswd", NULL);
-}
-#endif
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-goto out;
-}
-close(datafd[0]);
-datafd[0] = -1;
-
-if (qemu_write_full(datafd[1], chpasswddata, chpasswdlen) != chpasswdlen) {
-error_setg_errno(errp, errno, "cannot write new account password");
-goto out;
-}
-close(datafd[1]);
-datafd[1] = -1;
-
-ga_wait_child(pid, , _err);
+ga_run_command(argv, chpasswddata, "set user password", _err);
 if (local_err) {
 error_propagate(errp, local_err);
-goto out;
-}
-
-if (!WIFEXITED(status)) {
-error_setg(errp, "child process has terminated abnormally");
-goto out;
-}
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "child process has failed to set user password");
-goto out;
-}
-
-out:
-g_free(chpasswddata);
-g_free(rawpasswddata);
-g_free(passwd_path);
-if (datafd[0] != -1) {
-close(datafd[0]);
-}
-if (datafd[1] != -1) {
-close(datafd[1]);
+return;
 }
 }
 #else /* __linux__ || __FreeBSD__ */
-- 
2.39.3




[PATCH v3 3/7] qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper

2024-03-15 Thread Andrey Drobyshev
Also remove the G_GNUC_UNUSED attribute added in the previous commit from
the helper.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 39 ++-
 1 file changed, 6 insertions(+), 33 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index ad05630086..d4025e0c1e 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -108,7 +108,6 @@ static ssize_t ga_pipe_read_str(int fd[2], char **str)
  * sending string to stdin and taking error message from
  * stdout/err.
  */
-G_GNUC_UNUSED
 static int ga_run_command(const char *argv[], const char *in_str,
   const char *action, Error **errp)
 {
@@ -230,8 +229,6 @@ void qmp_guest_shutdown(const char *mode, Error **errp)
 {
 const char *shutdown_flag;
 Error *local_err = NULL;
-pid_t pid;
-int status;
 
 #ifdef CONFIG_SOLARIS
 const char *powerdown_flag = "-i5";
@@ -260,46 +257,22 @@ void qmp_guest_shutdown(const char *mode, Error **errp)
 return;
 }
 
-pid = fork();
-if (pid == 0) {
-/* child, start the shutdown */
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
+const char *argv[] = {"/sbin/shutdown",
 #ifdef CONFIG_SOLARIS
-execl("/sbin/shutdown", "shutdown", shutdown_flag, "-g0", "-y",
-  "hypervisor initiated shutdown", (char *)NULL);
+  shutdown_flag, "-g0", "-y",
 #elif defined(CONFIG_BSD)
-execl("/sbin/shutdown", "shutdown", shutdown_flag, "+0",
-   "hypervisor initiated shutdown", (char *)NULL);
+  shutdown_flag, "+0",
 #else
-execl("/sbin/shutdown", "shutdown", "-h", shutdown_flag, "+0",
-   "hypervisor initiated shutdown", (char *)NULL);
+  "-h", shutdown_flag, "+0",
 #endif
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
+  "hypervisor initiated shutdown", (char *) NULL};
 
-ga_wait_child(pid, , _err);
+ga_run_command(argv, NULL, "shutdown", _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
 
-if (!WIFEXITED(status)) {
-error_setg(errp, "child process has terminated abnormally");
-return;
-}
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "child process has failed to shutdown");
-return;
-}
-
 /* succeeded */
 }
 
-- 
2.39.3




[PATCH v3 4/7] qga/commands-posix: qmp_guest_set_time: use ga_run_command helper

2024-03-15 Thread Andrey Drobyshev
There's no need to check for the existence of "/sbin/hwclock", the
exec() call will do that for us.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 43 +++
 1 file changed, 3 insertions(+), 40 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index d4025e0c1e..94b652d54e 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -279,21 +279,9 @@ void qmp_guest_shutdown(const char *mode, Error **errp)
 void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp)
 {
 int ret;
-int status;
-pid_t pid;
 Error *local_err = NULL;
 struct timeval tv;
-static const char hwclock_path[] = "/sbin/hwclock";
-static int hwclock_available = -1;
-
-if (hwclock_available < 0) {
-hwclock_available = (access(hwclock_path, X_OK) == 0);
-}
-
-if (!hwclock_available) {
-error_setg(errp, QERR_UNSUPPORTED);
-return;
-}
+const char *argv[] = {"/sbin/hwclock", has_time ? "-w" : "-s", NULL};
 
 /* If user has passed a time, validate and set it. */
 if (has_time) {
@@ -324,37 +312,12 @@ void qmp_guest_set_time(bool has_time, int64_t time_ns, 
Error **errp)
  * just need to synchronize the hardware clock. However, if no time was
  * passed, user is requesting the opposite: set the system time from the
  * hardware clock (RTC). */
-pid = fork();
-if (pid == 0) {
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-/* Use '/sbin/hwclock -w' to set RTC from the system time,
- * or '/sbin/hwclock -s' to set the system time from RTC. */
-execl(hwclock_path, "hwclock", has_time ? "-w" : "-s", NULL);
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
-
-ga_wait_child(pid, , _err);
+ga_run_command(argv, NULL, "set hardware clock to system time",
+   _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
-
-if (!WIFEXITED(status)) {
-error_setg(errp, "child process has terminated abnormally");
-return;
-}
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "hwclock failed to set hardware clock to system 
time");
-return;
-}
 }
 
 typedef enum {
-- 
2.39.3




[PATCH v3 2/7] qga: introduce ga_run_command() helper for guest cmd execution

2024-03-15 Thread Andrey Drobyshev
When executing guest commands in *nix environment, we repeat the same
fork/exec pattern multiple times.  Let's just separate it into a single
helper which would also be able to feed input data into the launched
process' stdin.  This way we can avoid code duplication.

To keep the history more bisectable, let's replace qmp commands
implementations one by one.  Also add G_GNUC_UNUSED attribute to the
helper and remove it in the next commit.

Originally-by: Yuri Pudgorodskiy 
Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 150 +++
 1 file changed, 150 insertions(+)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 8207c4c47e..ad05630086 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -76,6 +76,156 @@ static void ga_wait_child(pid_t pid, int *status, Error 
**errp)
 g_assert(rpid == pid);
 }
 
+static ssize_t ga_pipe_read_str(int fd[2], char **str)
+{
+ssize_t n, len = 0;
+char buf[1024];
+
+close(fd[1]);
+fd[1] = -1;
+while ((n = read(fd[0], buf, sizeof(buf))) != 0) {
+if (n < 0) {
+if (errno == EINTR) {
+continue;
+} else {
+len = -errno;
+break;
+}
+}
+*str = g_realloc(*str, len + n + 1);
+memcpy(*str + len, buf, n);
+len += n;
+*str[len] = '\0';
+}
+close(fd[0]);
+fd[0] = -1;
+
+return len;
+}
+
+/*
+ * Helper to run command with input/output redirection,
+ * sending string to stdin and taking error message from
+ * stdout/err.
+ */
+G_GNUC_UNUSED
+static int ga_run_command(const char *argv[], const char *in_str,
+  const char *action, Error **errp)
+{
+pid_t pid;
+int status;
+int retcode = -1;
+int infd[2] = { -1, -1 };
+int outfd[2] = { -1, -1 };
+char *str = NULL;
+ssize_t len = 0;
+
+if ((in_str && !g_unix_open_pipe(infd, FD_CLOEXEC, NULL)) ||
+!g_unix_open_pipe(outfd, FD_CLOEXEC, NULL)) {
+error_setg(errp, "cannot create pipe FDs");
+goto out;
+}
+
+pid = fork();
+if (pid == 0) {
+char *cherr = NULL;
+
+setsid();
+
+if (in_str) {
+/* Redirect stdin to infd. */
+close(infd[1]);
+dup2(infd[0], 0);
+close(infd[0]);
+} else {
+reopen_fd_to_null(0);
+}
+
+/* Redirect stdout/stderr to outfd. */
+close(outfd[0]);
+dup2(outfd[1], 1);
+dup2(outfd[1], 2);
+close(outfd[1]);
+
+execvp(argv[0], (char *const *)argv);
+
+/* Write the cause of failed exec to pipe for the parent to read it. */
+cherr = g_strdup_printf("failed to exec '%s'", argv[0]);
+perror(cherr);
+g_free(cherr);
+_exit(EXIT_FAILURE);
+} else if (pid < 0) {
+error_setg_errno(errp, errno, "failed to create child process");
+goto out;
+}
+
+if (in_str) {
+close(infd[0]);
+infd[0] = -1;
+if (qemu_write_full(infd[1], in_str, strlen(in_str)) !=
+strlen(in_str)) {
+error_setg_errno(errp, errno, "%s: cannot write to stdin pipe",
+ action);
+goto out;
+}
+close(infd[1]);
+infd[1] = -1;
+}
+
+len = ga_pipe_read_str(outfd, );
+if (len < 0) {
+error_setg_errno(errp, -len, "%s: cannot read from stdout/stderr pipe",
+ action);
+goto out;
+}
+
+ga_wait_child(pid, , errp);
+if (*errp) {
+goto out;
+}
+
+if (!WIFEXITED(status)) {
+if (len) {
+error_setg(errp, "child process has terminated abnormally: %s",
+   str);
+} else {
+error_setg(errp, "child process has terminated abnormally");
+}
+goto out;
+}
+
+retcode = WEXITSTATUS(status);
+
+if (WEXITSTATUS(status)) {
+if (len) {
+error_setg(errp, "child process has failed to %s: %s",
+   action, str);
+} else {
+error_setg(errp, "child process has failed to %s: exit status %d",
+   action, WEXITSTATUS(status));
+}
+goto out;
+}
+
+out:
+g_free(str);
+
+if (infd[0] != -1) {
+close(infd[0]);
+}
+if (infd[1] != -1) {
+close(infd[1]);
+}
+if (outfd[0] != -1) {
+close(outfd[0]);
+}
+if (outfd[1] != -1) {
+close(outfd[1]);
+}
+
+return retcode;
+}
+
 void qmp_guest_shutdown(const char *mode, Error **errp)
 {
 const char *shutdown_flag;
-- 
2.39.3




[PATCH v3 6/7] qga/commands-posix: don't do fork()/exec() when suspending via sysfs

2024-03-15 Thread Andrey Drobyshev
Since commit 246d76eba ("qga: guest_suspend: decoupling pm-utils and sys
logic") pm-utils logic is running in a separate child from the sysfs
logic.  Now when suspending via sysfs we don't really need to do that in
a separate process as we only need to perform one write to /sys/power/state.

Let's just use g_file_set_contents() to simplify things here.

Suggested-by: Daniel P. Berrangé 
Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 41 +
 1 file changed, 5 insertions(+), 36 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 610d225d30..e0ea377f65 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -1928,52 +1928,21 @@ static bool linux_sys_state_supports_mode(SuspendMode 
mode, Error **errp)
 
 static void linux_sys_state_suspend(SuspendMode mode, Error **errp)
 {
-Error *local_err = NULL;
+GError *local_gerr = NULL;
 const char *sysfile_strs[3] = {"disk", "mem", NULL};
 const char *sysfile_str = sysfile_strs[mode];
-pid_t pid;
-int status;
 
 if (!sysfile_str) {
 error_setg(errp, "unknown guest suspend mode");
 return;
 }
 
-pid = fork();
-if (!pid) {
-/* child */
-int fd;
-
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-fd = open(LINUX_SYS_STATE_FILE, O_WRONLY);
-if (fd < 0) {
-_exit(EXIT_FAILURE);
-}
-
-if (write(fd, sysfile_str, strlen(sysfile_str)) < 0) {
-_exit(EXIT_FAILURE);
-}
-
-_exit(EXIT_SUCCESS);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
-
-ga_wait_child(pid, , _err);
-if (local_err) {
-error_propagate(errp, local_err);
+if (!g_file_set_contents(LINUX_SYS_STATE_FILE, sysfile_str,
+ -1, _gerr)) {
+error_setg(errp, "suspend: cannot write to '%s': %s",
+   LINUX_SYS_STATE_FILE, local_gerr->message);
 return;
 }
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "child process has failed to suspend");
-}
-
 }
 
 static void guest_suspend(SuspendMode mode, Error **errp)
-- 
2.39.3




[PATCH v3 0/7] qga/commands-posix: replace code duplicating commands with a helper

2024-03-15 Thread Andrey Drobyshev
v2 -> v3:
  * Patch 2/7:
- ga_pipe_read_str() helper now returns -errno in case of an error
  during read from pipe, so that the caller may use it to set
  error_setg_errno();
- ga_pipe_read_str() allocates +1 additional byte to make the
  string read from pipe null-terminated on every iteration;
  * Patch 6/7: patch is rewritten to completely get rid of fork()/exec()
when suspending via sysfs, it now simply uses g_file_set_contents()
(suggested by Daniel);
  * Patch 7/7: cosmetic change: removed unneeded brackets in an
expression.

v2: https://lists.nongnu.org/archive/html/qemu-devel/2024-03/msg00147.html

Andrey Drobyshev (7):
  qga: guest-get-fsinfo: add optional 'total-bytes-root' field
  qga: introduce ga_run_command() helper for guest cmd execution
  qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper
  qga/commands-posix: qmp_guest_set_time: use ga_run_command helper
  qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper
  qga/commands-posix: don't do fork()/exec() when suspending via sysfs
  qga/commands-posix: qmp_guest_set_user_password: use ga_run_command
helper

 qga/commands-posix.c | 404 +++
 qga/commands-win32.c |   1 +
 qga/qapi-schema.json |  12 +-
 3 files changed, 193 insertions(+), 224 deletions(-)

-- 
2.39.3




[PATCH v3 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-root' field

2024-03-15 Thread Andrey Drobyshev
Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to
GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo:
used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as
returned by statvfs(3).  While on Windows guests that's all we can get
with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in
total file system size, as it's visible for root user.  Let's add an
optional field 'total-bytes-root' to GuestFilesystemInfo struct, which'd
only be reported on POSIX and represent f_blocks value as returned by
statvfs(3).

While here, let's document better where those values come from in both
POSIX and Windows.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c |  2 ++
 qga/commands-win32.c |  1 +
 qga/qapi-schema.json | 12 +++-
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 26008db497..8207c4c47e 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -1569,8 +1569,10 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct 
FsMount *mount,
 nonroot_total = used + buf.f_bavail;
 fs->used_bytes = used * fr_size;
 fs->total_bytes = nonroot_total * fr_size;
+fs->total_bytes_root = buf.f_blocks * fr_size;
 
 fs->has_total_bytes = true;
+fs->has_total_bytes_root = true;
 fs->has_used_bytes = true;
 }
 
diff --git a/qga/commands-win32.c b/qga/commands-win32.c
index a1015757d8..9e820aad8d 100644
--- a/qga/commands-win32.c
+++ b/qga/commands-win32.c
@@ -1143,6 +1143,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(char 
*guid, Error **errp)
 fs = g_malloc(sizeof(*fs));
 fs->name = g_strdup(guid);
 fs->has_total_bytes = false;
+fs->has_total_bytes_root = false;
 fs->has_used_bytes = false;
 if (len == 0) {
 fs->mountpoint = g_strdup("System Reserved");
diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
index b8efe31897..093a5ab602 100644
--- a/qga/qapi-schema.json
+++ b/qga/qapi-schema.json
@@ -1031,8 +1031,18 @@
 # @type: file system type string
 #
 # @used-bytes: file system used bytes (since 3.0)
+# * POSIX: (f_blocks - f_bfree) * f_frsize, as returned by statvfs(3)
+# * Windows: (TotalNumberOfBytes - TotalNumberOfFreeBytes), as returned
+#   by GetDiskFreeSpaceEx()
 #
 # @total-bytes: non-root file system total bytes (since 3.0)
+# * POSIX: (f_blocks - f_bfree + f_bavail) * f_frsize, as returned by
+#   statvfs(3)
+# * Windows: TotalNumberOfBytes, as returned by GetDiskFreeSpaceEx()
+#
+# @total-bytes-root: total file system size in bytes (as visible for a
+# priviledged user) (since 8.3)
+# * POSIX only: (f_blocks * f_frsize), returned by statvfs(3)
 #
 # @disk: an array of disk hardware information that the volume lies
 # on, which may be empty if the disk type is not supported
@@ -1042,7 +1052,7 @@
 { 'struct': 'GuestFilesystemInfo',
   'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str',
'*used-bytes': 'uint64', '*total-bytes': 'uint64',
-   'disk': ['GuestDiskAddress']} }
+   '*total-bytes-root': 'uint64', 'disk': ['GuestDiskAddress']} }
 
 ##
 # @guest-get-fsinfo:
-- 
2.39.3




Re: [PATCH v2 6/7] qga/commands-posix: use ga_run_command helper when suspending via sysfs

2024-03-15 Thread Andrey Drobyshev
On 3/5/24 20:34, Daniel P. Berrangé wrote:
> [Вы нечасто получаете письма от berra...@redhat.com. Узнайте, почему это 
> важно, по адресу https://aka.ms/LearnAboutSenderIdentification ]
> 
> On Fri, Mar 01, 2024 at 07:28:57PM +0200, Andrey Drobyshev wrote:
>> We replace the direct call to open() with a "sh -c 'echo ...'" call, so
>> that it becomes an executable command.
> 
> Introduced an indirection via the shell is a significant step
> backwards IMHO.
> 
>>
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>  qga/commands-posix.c | 36 
>>  1 file changed, 4 insertions(+), 32 deletions(-)
>>
>> diff --git a/qga/commands-posix.c b/qga/commands-posix.c
>> index dd2a7ad2e6..f3f4a05e2d 100644
>> --- a/qga/commands-posix.c
>> +++ b/qga/commands-posix.c
>> @@ -1921,49 +1921,21 @@ static void linux_sys_state_suspend(SuspendMode 
>> mode, Error **errp)
>>  Error *local_err = NULL;
>>  const char *sysfile_strs[3] = {"disk", "mem", NULL};
>>  const char *sysfile_str = sysfile_strs[mode];
>> -pid_t pid;
>> -int status;
>>
>>  if (!sysfile_str) {
>>  error_setg(errp, "unknown guest suspend mode");
>>  return;
>>  }
>>
>> -pid = fork();
>> -if (!pid) {
>> -/* child */
>> -int fd;
>> -
>> -setsid();
>> -reopen_fd_to_null(0);
>> -reopen_fd_to_null(1);
>> -reopen_fd_to_null(2);
>> -
>> -fd = open(LINUX_SYS_STATE_FILE, O_WRONLY);
>> -if (fd < 0) {
>> -_exit(EXIT_FAILURE);
>> -}
>> -
>> -if (write(fd, sysfile_str, strlen(sysfile_str)) < 0) {
>> -_exit(EXIT_FAILURE);
>> -}
>> -
>> -_exit(EXIT_SUCCESS);
> 
> 
> This pre-existing code is strange to me.  Why do we need to fork a
> new process in order to write to /sys/power/state ?
> 
> Looking at the original commit
> 
>   commit 11d0f1255bd5651f628280dc96c4ce9d63ae9236
>   Author: Luiz Capitulino 
>   Date:   Tue Feb 28 11:03:03 2012 -0300
> 
> qemu-ga: add guest-suspend-disk
> 
> 
> The code made a little more sense, as after fork() it first
> tried to execve  'pm-utils', and then had the sysfs codepath
> as a fallback. IOW having the sysfs code after fork() was a
> much easier code structure.
> 
> This was all changed in
> 
>   commit 246d76eba1944d7e59affb288ec27d7fcfb5d256
>   Author: Daniel Henrique Barboza 
>   Date:   Thu Jun 21 07:21:50 2018 -0300
> 
> qga: guest_suspend: decoupling pm-utils and sys logic
> 
> 
> so the pm-utils logic runs in a separate forked child
> from the sysfs logic.
> 
> AFAICT, that should have made it completely redundant to
> fork a process to access /sys/power/state.
> 
> Does anyone know of a reason to keep the fork() here ? Of
> not we should just be calling 'g_file_set_contents' without
> fork
> 

In the original commit message Luiz Capitulino explained that multiple
forks are needed to properly reap child processes needed for execl().
AFAIU since writing to /sys/power/state doesn't require execl(), fork()
isn't needed either.  I think the 2nd commit you mention simply kept
things as they were since it wasn't the original goal of the patch.  So
we're just looking at legacy code.

I agree that in this case my original patch is redundant and we may
replace it with smth like g_file_set_contents().  I'll add it in v3.

>> -} else if (pid < 0) {
>> -error_setg_errno(errp, errno, "failed to create child process");
>> -return;
>> -}
>> +g_autofree char *echo_cmd = g_strdup_printf(
>> +"echo %s > " LINUX_SYS_STATE_FILE, sysfile_str);
>> +const char *argv[] = {"sh", "-c", echo_cmd, NULL};
>>
>> -ga_wait_child(pid, , _err);
>> +ga_run_command(argv, NULL, "suspend", _err);
>>  if (local_err) {
>>  error_propagate(errp, local_err);
>>  return;
>>  }
>> -
>> -if (WEXITSTATUS(status)) {
>> -error_setg(errp, "child process has failed to suspend");
>> -}
>> -
>>  }
>>
>>  static void guest_suspend(SuspendMode mode, Error **errp)
>> --
>> 2.39.3
>>
>>
> 
> With regards,
> Daniel
> --
> |: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o-https://fstop138.berrange.com :|
> |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
> 




Re: [PATCH v2 2/7] qga: introduce ga_run_command() helper for guest cmd execution

2024-03-15 Thread Andrey Drobyshev
On 3/5/24 19:58, Daniel P. Berrangé wrote:
> On Fri, Mar 01, 2024 at 07:28:53PM +0200, Andrey Drobyshev wrote:
>> When executing guest commands in *nix environment, we repeat the same
>> fork/exec pattern multiple times.  Let's just separate it into a single
>> helper which would also be able to feed input data into the launched
>> process' stdin.  This way we can avoid code duplication.
>>
>> To keep the history more bisectable, let's replace qmp commands
>> implementations one by one.  Also add G_GNUC_UNUSED attribute to the
>> helper and remove it in the next commit.
>>
>> Originally-by: Yuri Pudgorodskiy 
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>  qga/commands-posix.c | 140 +++
>>  1 file changed, 140 insertions(+)
>>
>> diff --git a/qga/commands-posix.c b/qga/commands-posix.c
>> index 8207c4c47e..781498418f 100644
>> --- a/qga/commands-posix.c
>> +++ b/qga/commands-posix.c
>> @@ -76,6 +76,146 @@ static void ga_wait_child(pid_t pid, int *status, Error 
>> **errp)
>>  g_assert(rpid == pid);
>>  }
>>
>> +static void ga_pipe_read_str(int fd[2], char **str, size_t *len)
>> +{
>> +ssize_t n;
>> +char buf[1024];
>> +close(fd[1]);
>> +fd[1] = -1;
>> +while ((n = read(fd[0], buf, sizeof(buf))) != 0) {
>> +if (n < 0) {
>> +if (errno == EINTR) {
>> +continue;
>> +} else {
>> +break;
> 
> This is a fatal error condition
> 
>> +}
>> +}
>> +*str = g_realloc(*str, *len + n);
>> +memcpy(*str + *len, buf, n);
>> +*len += n;
>> +}
>> +close(fd[0]);
>> +fd[0] = -1;
> 
> and yet as far as the caller is concerned we're not indicating
> any sense of failure here. They're just being returned a partially
> read data stream as if it were successful. IMHO this needs to be
> reported properly.
>

Agreed.  We might make this helper return -errno in case of an erroneous
read from pipe, check the value in the caller and do error_setg_errno()
if smth went wrong.

> 
> Nothing in this method has NUL terminated 'str', so we are
> relying on the caller *always* honouring 'len', but.
>

Agreed as well.  Let's allocate +1 additional byte and store '\0' in
there on every iteration, making sure our result is always
null-terminated. That way we won't need to check 'len' in the caller.

>> +}
>> +
>> +/*
>> + * Helper to run command with input/output redirection,
>> + * sending string to stdin and taking error message from
>> + * stdout/err.
>> + */
>> +G_GNUC_UNUSED
>> +static int ga_run_command(const char *argv[], const char *in_str,
>> +  const char *action, Error **errp)
>> +{
>> +pid_t pid;
>> +int status;
>> +int retcode = -1;
>> +int infd[2] = { -1, -1 };
>> +int outfd[2] = { -1, -1 };
>> +char *str = NULL;
>> +size_t len = 0;
>> +
>> +if ((in_str && !g_unix_open_pipe(infd, FD_CLOEXEC, NULL)) ||
>> +!g_unix_open_pipe(outfd, FD_CLOEXEC, NULL)) {
>> +error_setg(errp, "cannot create pipe FDs");
>> +goto out;
>> +}
>> +
>> +pid = fork();
>> +if (pid == 0) {
>> +char *cherr = NULL;
>> +
>> +setsid();
>> +
>> +if (in_str) {
>> +/* Redirect stdin to infd. */
>> +close(infd[1]);
>> +dup2(infd[0], 0);
>> +close(infd[0]);
>> +} else {
>> +reopen_fd_to_null(0);
>> +}
>> +
>> +/* Redirect stdout/stderr to outfd. */
>> +close(outfd[0]);
>> +dup2(outfd[1], 1);
>> +dup2(outfd[1], 2);
>> +close(outfd[1]);
>> +
>> +execvp(argv[0], (char *const *)argv);
>> +
>> +/* Write the cause of failed exec to pipe for the parent to read 
>> it. */
>> +cherr = g_strdup_printf("failed to exec '%s'", argv[0]);
>> +perror(cherr);
>> +g_free(cherr);
>> +_exit(EXIT_FAILURE);
>> +} else if (pid < 0) {
>> +error_setg_errno(errp, errno, "failed to create child process");
>> +goto out;
>> +}
>> +
>> +if (in_str) {
>> +close(infd[0]);
>> +infd[0] = -1;
>> +if (qemu_write_full(infd[1], in_str, strlen(in_str)) !=
>> +   

Re: [PATCH v2 7/7] qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper

2024-03-15 Thread Andrey Drobyshev
On 3/5/24 20:38, Daniel P. Berrangé wrote:
> On Fri, Mar 01, 2024 at 07:28:58PM +0200, Andrey Drobyshev wrote:
>> There's no need to check for the existence of the "chpasswd", "pw"
>> executables, as the exec() call will do that for us.
>>
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>  qga/commands-posix.c | 96 ++--
>>  1 file changed, 13 insertions(+), 83 deletions(-)
>>
>> diff --git a/qga/commands-posix.c b/qga/commands-posix.c
>> index f3f4a05e2d..f2e9496b80 100644
>> --- a/qga/commands-posix.c
>> +++ b/qga/commands-posix.c
>> @@ -2144,14 +2144,8 @@ void qmp_guest_set_user_password(const char *username,
>>   Error **errp)
>>  {
>>  Error *local_err = NULL;
>> -char *passwd_path = NULL;
>> -pid_t pid;
>> -int status;
>> -int datafd[2] = { -1, -1 };
>> -char *rawpasswddata = NULL;
>> +g_autofree char *rawpasswddata = NULL;
>>  size_t rawpasswdlen;
>> -char *chpasswddata = NULL;
>> -size_t chpasswdlen;
>>
>>  rawpasswddata = (char *)qbase64_decode(password, -1, , 
>> errp);
>>  if (!rawpasswddata) {
>> @@ -2162,95 +2156,31 @@ void qmp_guest_set_user_password(const char 
>> *username,
>>
>>  if (strchr(rawpasswddata, '\n')) {
>>  error_setg(errp, "forbidden characters in raw password");
>> -goto out;
>> +return;
>>  }
>>
>>  if (strchr(username, '\n') ||
>>  strchr(username, ':')) {
>>  error_setg(errp, "forbidden characters in username");
>> -goto out;
>> +return;
>>  }
>>
>>  #ifdef __FreeBSD__
>> -chpasswddata = g_strdup(rawpasswddata);
>> -passwd_path = g_find_program_in_path("pw");
>> +g_autofree char *chpasswdata = g_strdup(rawpasswddata);
>> +const char *crypt_flag = (crypted) ? "-H" : "-h";
>> +const char *argv[] = {"pw", "usermod", "-n", username,
>> +  crypt_flag, "0", NULL};
>>  #else
>> -chpasswddata = g_strdup_printf("%s:%s\n", username, rawpasswddata);
>> -passwd_path = g_find_program_in_path("chpasswd");
>> +g_autofree char *chpasswddata = g_strdup_printf("%s:%s\n", username,
>> +rawpasswddata);
>> +const char *crypt_flag = (crypted) ? "-e" : NULL;
> 
> Style nit-pick - no '(...)' around 'crypted' is needed here, or
> the other place later in this method.
> 
> Yes, that was a pre-existing issue, but since you're refactoring
> the code, might as well kill the redundant brackets.
> 
> [...]

Sure, let's get rid of them. Thanks.




[PATCH v2 4/7] qga/commands-posix: qmp_guest_set_time: use ga_run_command helper

2024-03-01 Thread Andrey Drobyshev
There's no need to check for the existence of "/sbin/hwclock", the
exec() call will do that for us.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 43 +++
 1 file changed, 3 insertions(+), 40 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 003054891f..e44203a273 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -269,21 +269,9 @@ void qmp_guest_shutdown(const char *mode, Error **errp)
 void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp)
 {
 int ret;
-int status;
-pid_t pid;
 Error *local_err = NULL;
 struct timeval tv;
-static const char hwclock_path[] = "/sbin/hwclock";
-static int hwclock_available = -1;
-
-if (hwclock_available < 0) {
-hwclock_available = (access(hwclock_path, X_OK) == 0);
-}
-
-if (!hwclock_available) {
-error_setg(errp, QERR_UNSUPPORTED);
-return;
-}
+const char *argv[] = {"/sbin/hwclock", has_time ? "-w" : "-s", NULL};
 
 /* If user has passed a time, validate and set it. */
 if (has_time) {
@@ -314,37 +302,12 @@ void qmp_guest_set_time(bool has_time, int64_t time_ns, 
Error **errp)
  * just need to synchronize the hardware clock. However, if no time was
  * passed, user is requesting the opposite: set the system time from the
  * hardware clock (RTC). */
-pid = fork();
-if (pid == 0) {
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-/* Use '/sbin/hwclock -w' to set RTC from the system time,
- * or '/sbin/hwclock -s' to set the system time from RTC. */
-execl(hwclock_path, "hwclock", has_time ? "-w" : "-s", NULL);
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
-
-ga_wait_child(pid, , _err);
+ga_run_command(argv, NULL, "set hardware clock to system time",
+   _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
-
-if (!WIFEXITED(status)) {
-error_setg(errp, "child process has terminated abnormally");
-return;
-}
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "hwclock failed to set hardware clock to system 
time");
-return;
-}
 }
 
 typedef enum {
-- 
2.39.3




[PATCH v2 3/7] qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper

2024-03-01 Thread Andrey Drobyshev
Also remove the G_GNUC_UNUSED attribute added in the previous commit from
the helper.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 39 ++-
 1 file changed, 6 insertions(+), 33 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 781498418f..003054891f 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -103,7 +103,6 @@ static void ga_pipe_read_str(int fd[2], char **str, size_t 
*len)
  * sending string to stdin and taking error message from
  * stdout/err.
  */
-G_GNUC_UNUSED
 static int ga_run_command(const char *argv[], const char *in_str,
   const char *action, Error **errp)
 {
@@ -220,8 +219,6 @@ void qmp_guest_shutdown(const char *mode, Error **errp)
 {
 const char *shutdown_flag;
 Error *local_err = NULL;
-pid_t pid;
-int status;
 
 #ifdef CONFIG_SOLARIS
 const char *powerdown_flag = "-i5";
@@ -250,46 +247,22 @@ void qmp_guest_shutdown(const char *mode, Error **errp)
 return;
 }
 
-pid = fork();
-if (pid == 0) {
-/* child, start the shutdown */
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
+const char *argv[] = {"/sbin/shutdown",
 #ifdef CONFIG_SOLARIS
-execl("/sbin/shutdown", "shutdown", shutdown_flag, "-g0", "-y",
-  "hypervisor initiated shutdown", (char *)NULL);
+  shutdown_flag, "-g0", "-y",
 #elif defined(CONFIG_BSD)
-execl("/sbin/shutdown", "shutdown", shutdown_flag, "+0",
-   "hypervisor initiated shutdown", (char *)NULL);
+  shutdown_flag, "+0",
 #else
-execl("/sbin/shutdown", "shutdown", "-h", shutdown_flag, "+0",
-   "hypervisor initiated shutdown", (char *)NULL);
+  "-h", shutdown_flag, "+0",
 #endif
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
+  "hypervisor initiated shutdown", (char *) NULL};
 
-ga_wait_child(pid, , _err);
+ga_run_command(argv, NULL, "shutdown", _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
 
-if (!WIFEXITED(status)) {
-error_setg(errp, "child process has terminated abnormally");
-return;
-}
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "child process has failed to shutdown");
-return;
-}
-
 /* succeeded */
 }
 
-- 
2.39.3




[PATCH v2 0/7] qga/commands-posix: replace code duplicating commands with a helper

2024-03-01 Thread Andrey Drobyshev
v1 --> v2:
 * Replace commit for guest-get-fsinfo: instead of reporting statvfs(3)
   values directly, keep the old ones but add an additional optional
   field 'total-bytes-root', only reported in POSIX.  Also tweak the
   fields description in qga/qapi-schema.json to document where the
   values come from.

v1: https://lists.nongnu.org/archive/html/qemu-devel/2024-02/msg05766.html

Andrey Drobyshev (7):
  qga: guest-get-fsinfo: add optional 'total-bytes-root' field
  qga: introduce ga_run_command() helper for guest cmd execution
  qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper
  qga/commands-posix: qmp_guest_set_time: use ga_run_command helper
  qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper
  qga/commands-posix: use ga_run_command helper when suspending via
sysfs
  qga/commands-posix: qmp_guest_set_user_password: use ga_run_command
helper

 qga/commands-posix.c | 389 +++
 qga/commands-win32.c |   1 +
 qga/qapi-schema.json |  12 +-
 3 files changed, 182 insertions(+), 220 deletions(-)

-- 
2.39.3




[PATCH v2 2/7] qga: introduce ga_run_command() helper for guest cmd execution

2024-03-01 Thread Andrey Drobyshev
When executing guest commands in *nix environment, we repeat the same
fork/exec pattern multiple times.  Let's just separate it into a single
helper which would also be able to feed input data into the launched
process' stdin.  This way we can avoid code duplication.

To keep the history more bisectable, let's replace qmp commands
implementations one by one.  Also add G_GNUC_UNUSED attribute to the
helper and remove it in the next commit.

Originally-by: Yuri Pudgorodskiy 
Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 140 +++
 1 file changed, 140 insertions(+)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 8207c4c47e..781498418f 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -76,6 +76,146 @@ static void ga_wait_child(pid_t pid, int *status, Error 
**errp)
 g_assert(rpid == pid);
 }
 
+static void ga_pipe_read_str(int fd[2], char **str, size_t *len)
+{
+ssize_t n;
+char buf[1024];
+close(fd[1]);
+fd[1] = -1;
+while ((n = read(fd[0], buf, sizeof(buf))) != 0) {
+if (n < 0) {
+if (errno == EINTR) {
+continue;
+} else {
+break;
+}
+}
+*str = g_realloc(*str, *len + n);
+memcpy(*str + *len, buf, n);
+*len += n;
+}
+close(fd[0]);
+fd[0] = -1;
+}
+
+/*
+ * Helper to run command with input/output redirection,
+ * sending string to stdin and taking error message from
+ * stdout/err.
+ */
+G_GNUC_UNUSED
+static int ga_run_command(const char *argv[], const char *in_str,
+  const char *action, Error **errp)
+{
+pid_t pid;
+int status;
+int retcode = -1;
+int infd[2] = { -1, -1 };
+int outfd[2] = { -1, -1 };
+char *str = NULL;
+size_t len = 0;
+
+if ((in_str && !g_unix_open_pipe(infd, FD_CLOEXEC, NULL)) ||
+!g_unix_open_pipe(outfd, FD_CLOEXEC, NULL)) {
+error_setg(errp, "cannot create pipe FDs");
+goto out;
+}
+
+pid = fork();
+if (pid == 0) {
+char *cherr = NULL;
+
+setsid();
+
+if (in_str) {
+/* Redirect stdin to infd. */
+close(infd[1]);
+dup2(infd[0], 0);
+close(infd[0]);
+} else {
+reopen_fd_to_null(0);
+}
+
+/* Redirect stdout/stderr to outfd. */
+close(outfd[0]);
+dup2(outfd[1], 1);
+dup2(outfd[1], 2);
+close(outfd[1]);
+
+execvp(argv[0], (char *const *)argv);
+
+/* Write the cause of failed exec to pipe for the parent to read it. */
+cherr = g_strdup_printf("failed to exec '%s'", argv[0]);
+perror(cherr);
+g_free(cherr);
+_exit(EXIT_FAILURE);
+} else if (pid < 0) {
+error_setg_errno(errp, errno, "failed to create child process");
+goto out;
+}
+
+if (in_str) {
+close(infd[0]);
+infd[0] = -1;
+if (qemu_write_full(infd[1], in_str, strlen(in_str)) !=
+strlen(in_str)) {
+error_setg_errno(errp, errno, "%s: cannot write to stdin pipe",
+ action);
+goto out;
+}
+close(infd[1]);
+infd[1] = -1;
+}
+
+ga_pipe_read_str(outfd, , );
+
+ga_wait_child(pid, , errp);
+if (*errp) {
+goto out;
+}
+
+if (!WIFEXITED(status)) {
+if (len) {
+error_setg(errp, "child process has terminated abnormally: %s",
+   str);
+} else {
+error_setg(errp, "child process has terminated abnormally");
+}
+goto out;
+}
+
+retcode = WEXITSTATUS(status);
+
+if (WEXITSTATUS(status)) {
+if (len) {
+error_setg(errp, "child process has failed to %s: %s",
+   action, str);
+} else {
+error_setg(errp, "child process has failed to %s: exit status %d",
+   action, WEXITSTATUS(status));
+}
+goto out;
+}
+
+out:
+g_free(str);
+
+if (infd[0] != -1) {
+close(infd[0]);
+}
+if (infd[1] != -1) {
+close(infd[1]);
+}
+if (outfd[0] != -1) {
+close(outfd[0]);
+}
+if (outfd[1] != -1) {
+close(outfd[1]);
+}
+
+return retcode;
+}
+
 void qmp_guest_shutdown(const char *mode, Error **errp)
 {
 const char *shutdown_flag;
-- 
2.39.3




[PATCH v2 7/7] qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper

2024-03-01 Thread Andrey Drobyshev
There's no need to check for the existence of the "chpasswd", "pw"
executables, as the exec() call will do that for us.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 96 ++--
 1 file changed, 13 insertions(+), 83 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index f3f4a05e2d..f2e9496b80 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -2144,14 +2144,8 @@ void qmp_guest_set_user_password(const char *username,
  Error **errp)
 {
 Error *local_err = NULL;
-char *passwd_path = NULL;
-pid_t pid;
-int status;
-int datafd[2] = { -1, -1 };
-char *rawpasswddata = NULL;
+g_autofree char *rawpasswddata = NULL;
 size_t rawpasswdlen;
-char *chpasswddata = NULL;
-size_t chpasswdlen;
 
 rawpasswddata = (char *)qbase64_decode(password, -1, , errp);
 if (!rawpasswddata) {
@@ -2162,95 +2156,31 @@ void qmp_guest_set_user_password(const char *username,
 
 if (strchr(rawpasswddata, '\n')) {
 error_setg(errp, "forbidden characters in raw password");
-goto out;
+return;
 }
 
 if (strchr(username, '\n') ||
 strchr(username, ':')) {
 error_setg(errp, "forbidden characters in username");
-goto out;
+return;
 }
 
 #ifdef __FreeBSD__
-chpasswddata = g_strdup(rawpasswddata);
-passwd_path = g_find_program_in_path("pw");
+g_autofree char *chpasswdata = g_strdup(rawpasswddata);
+const char *crypt_flag = (crypted) ? "-H" : "-h";
+const char *argv[] = {"pw", "usermod", "-n", username,
+  crypt_flag, "0", NULL};
 #else
-chpasswddata = g_strdup_printf("%s:%s\n", username, rawpasswddata);
-passwd_path = g_find_program_in_path("chpasswd");
+g_autofree char *chpasswddata = g_strdup_printf("%s:%s\n", username,
+rawpasswddata);
+const char *crypt_flag = (crypted) ? "-e" : NULL;
+const char *argv[] = {"chpasswd", crypt_flag, NULL};
 #endif
 
-chpasswdlen = strlen(chpasswddata);
-
-if (!passwd_path) {
-error_setg(errp, "cannot find 'passwd' program in PATH");
-goto out;
-}
-
-if (!g_unix_open_pipe(datafd, FD_CLOEXEC, NULL)) {
-error_setg(errp, "cannot create pipe FDs");
-goto out;
-}
-
-pid = fork();
-if (pid == 0) {
-close(datafd[1]);
-/* child */
-setsid();
-dup2(datafd[0], 0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-#ifdef __FreeBSD__
-const char *h_arg;
-h_arg = (crypted) ? "-H" : "-h";
-execl(passwd_path, "pw", "usermod", "-n", username, h_arg, "0", NULL);
-#else
-if (crypted) {
-execl(passwd_path, "chpasswd", "-e", NULL);
-} else {
-execl(passwd_path, "chpasswd", NULL);
-}
-#endif
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-goto out;
-}
-close(datafd[0]);
-datafd[0] = -1;
-
-if (qemu_write_full(datafd[1], chpasswddata, chpasswdlen) != chpasswdlen) {
-error_setg_errno(errp, errno, "cannot write new account password");
-goto out;
-}
-close(datafd[1]);
-datafd[1] = -1;
-
-ga_wait_child(pid, , _err);
+ga_run_command(argv, chpasswddata, "set user password", _err);
 if (local_err) {
 error_propagate(errp, local_err);
-goto out;
-}
-
-if (!WIFEXITED(status)) {
-error_setg(errp, "child process has terminated abnormally");
-goto out;
-}
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "child process has failed to set user password");
-goto out;
-}
-
-out:
-g_free(chpasswddata);
-g_free(rawpasswddata);
-g_free(passwd_path);
-if (datafd[0] != -1) {
-close(datafd[0]);
-}
-if (datafd[1] != -1) {
-close(datafd[1]);
+return;
 }
 }
 #else /* __linux__ || __FreeBSD__ */
-- 
2.39.3




[PATCH v2 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-root' field

2024-03-01 Thread Andrey Drobyshev
Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to
GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo:
used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as
returned by statvfs(3).  While on Windows guests that's all we can get
with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in
total file system size, as it's visible for root user.  Let's add an
optional field 'total-bytes-root' to GuestFilesystemInfo struct, which'd
only be reported on POSIX and represent f_blocks value as returned by
statvfs(3).

While here, let's document better where those values come from in both
POSIX and Windows.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c |  2 ++
 qga/commands-win32.c |  1 +
 qga/qapi-schema.json | 12 +++-
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 26008db497..8207c4c47e 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -1569,8 +1569,10 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct 
FsMount *mount,
 nonroot_total = used + buf.f_bavail;
 fs->used_bytes = used * fr_size;
 fs->total_bytes = nonroot_total * fr_size;
+fs->total_bytes_root = buf.f_blocks * fr_size;
 
 fs->has_total_bytes = true;
+fs->has_total_bytes_root = true;
 fs->has_used_bytes = true;
 }
 
diff --git a/qga/commands-win32.c b/qga/commands-win32.c
index a1015757d8..9e820aad8d 100644
--- a/qga/commands-win32.c
+++ b/qga/commands-win32.c
@@ -1143,6 +1143,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(char 
*guid, Error **errp)
 fs = g_malloc(sizeof(*fs));
 fs->name = g_strdup(guid);
 fs->has_total_bytes = false;
+fs->has_total_bytes_root = false;
 fs->has_used_bytes = false;
 if (len == 0) {
 fs->mountpoint = g_strdup("System Reserved");
diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
index b8efe31897..093a5ab602 100644
--- a/qga/qapi-schema.json
+++ b/qga/qapi-schema.json
@@ -1031,8 +1031,18 @@
 # @type: file system type string
 #
 # @used-bytes: file system used bytes (since 3.0)
+# * POSIX: (f_blocks - f_bfree) * f_frsize, as returned by statvfs(3)
+# * Windows: (TotalNumberOfBytes - TotalNumberOfFreeBytes), as returned
+#   by GetDiskFreeSpaceEx()
 #
 # @total-bytes: non-root file system total bytes (since 3.0)
+# * POSIX: (f_blocks - f_bfree + f_bavail) * f_frsize, as returned by
+#   statvfs(3)
+# * Windows: TotalNumberOfBytes, as returned by GetDiskFreeSpaceEx()
+#
+# @total-bytes-root: total file system size in bytes (as visible for a
+# priviledged user) (since 8.3)
+# * POSIX only: (f_blocks * f_frsize), returned by statvfs(3)
 #
 # @disk: an array of disk hardware information that the volume lies
 # on, which may be empty if the disk type is not supported
@@ -1042,7 +1052,7 @@
 { 'struct': 'GuestFilesystemInfo',
   'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str',
'*used-bytes': 'uint64', '*total-bytes': 'uint64',
-   'disk': ['GuestDiskAddress']} }
+   '*total-bytes-root': 'uint64', 'disk': ['GuestDiskAddress']} }
 
 ##
 # @guest-get-fsinfo:
-- 
2.39.3




[PATCH v2 6/7] qga/commands-posix: use ga_run_command helper when suspending via sysfs

2024-03-01 Thread Andrey Drobyshev
We replace the direct call to open() with a "sh -c 'echo ...'" call, so
that it becomes an executable command.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 36 
 1 file changed, 4 insertions(+), 32 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index dd2a7ad2e6..f3f4a05e2d 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -1921,49 +1921,21 @@ static void linux_sys_state_suspend(SuspendMode mode, 
Error **errp)
 Error *local_err = NULL;
 const char *sysfile_strs[3] = {"disk", "mem", NULL};
 const char *sysfile_str = sysfile_strs[mode];
-pid_t pid;
-int status;
 
 if (!sysfile_str) {
 error_setg(errp, "unknown guest suspend mode");
 return;
 }
 
-pid = fork();
-if (!pid) {
-/* child */
-int fd;
-
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-fd = open(LINUX_SYS_STATE_FILE, O_WRONLY);
-if (fd < 0) {
-_exit(EXIT_FAILURE);
-}
-
-if (write(fd, sysfile_str, strlen(sysfile_str)) < 0) {
-_exit(EXIT_FAILURE);
-}
-
-_exit(EXIT_SUCCESS);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
+g_autofree char *echo_cmd = g_strdup_printf(
+"echo %s > " LINUX_SYS_STATE_FILE, sysfile_str);
+const char *argv[] = {"sh", "-c", echo_cmd, NULL};
 
-ga_wait_child(pid, , _err);
+ga_run_command(argv, NULL, "suspend", _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "child process has failed to suspend");
-}
-
 }
 
 static void guest_suspend(SuspendMode mode, Error **errp)
-- 
2.39.3




[PATCH v2 5/7] qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper

2024-03-01 Thread Andrey Drobyshev
There's no need to check for the existence of the hook executable, as the
exec() call will do that for us.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 35 +++
 1 file changed, 3 insertions(+), 32 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index e44203a273..dd2a7ad2e6 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -726,8 +726,6 @@ static const char *fsfreeze_hook_arg_string[] = {
 
 static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp)
 {
-int status;
-pid_t pid;
 const char *hook;
 const char *arg_str = fsfreeze_hook_arg_string[arg];
 Error *local_err = NULL;
@@ -736,42 +734,15 @@ static void execute_fsfreeze_hook(FsfreezeHookArg arg, 
Error **errp)
 if (!hook) {
 return;
 }
-if (access(hook, X_OK) != 0) {
-error_setg_errno(errp, errno, "can't access fsfreeze hook '%s'", hook);
-return;
-}
 
-slog("executing fsfreeze hook with arg '%s'", arg_str);
-pid = fork();
-if (pid == 0) {
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-execl(hook, hook, arg_str, NULL);
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
+const char *argv[] = {hook, arg_str, NULL};
 
-ga_wait_child(pid, , _err);
+slog("executing fsfreeze hook with arg '%s'", arg_str);
+ga_run_command(argv, NULL, "execute fsfreeze hook", _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
-
-if (!WIFEXITED(status)) {
-error_setg(errp, "fsfreeze hook has terminated abnormally");
-return;
-}
-
-status = WEXITSTATUS(status);
-if (status) {
-error_setg(errp, "fsfreeze hook has failed with status %d", status);
-return;
-}
 }
 
 /*
-- 
2.39.3




Re: [PATCH 1/7] qga/commands-posix: return fsinfo values directly as reported by statvfs

2024-03-01 Thread Andrey Drobyshev
On 2/28/24 09:55, Marc-André Lureau wrote:
> [Вы нечасто получаете письма от marcandre.lur...@redhat.com. Узнайте, почему 
> это важно, по адресу https://aka.ms/LearnAboutSenderIdentification ]
> 
> Hi
> 
> On Tue, Feb 27, 2024 at 4:38 PM Andrey Drobyshev
>  wrote:
>>
>>
>>
>> On 2/26/24 20:50, Konstantin Kostiuk wrote:
>>>
>>> Best Regards,
>>> Konstantin Kostiuk.
>>>
>>>
>>> On Mon, Feb 26, 2024 at 7:02 PM Andrey Drobyshev
>>> mailto:andrey.drobys...@virtuozzo.com>>
>>> wrote:
>>>
>>> Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to
>>> GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo:
>>> used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail).
>>> These calculations might be obscure for the end user and require one to
>>> actually get into QGA source to understand how they're obtained. Let's
>>> just report the values f_blocks, f_bfree, f_bavail (in bytes) from
>>> statvfs() as they are, letting the user decide how to process them
>>> further.
>>>
>>> Originally-by: Yuri Pudgorodskiy >> <mailto:y...@virtuozzo.com>>
>>> Signed-off-by: Andrey Drobyshev >> <mailto:andrey.drobys...@virtuozzo.com>>
>>> ---
>>>  qga/commands-posix.c | 16 +++-
>>>  qga/qapi-schema.json | 11 +++
>>>  2 files changed, 14 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/qga/commands-posix.c b/qga/commands-posix.c
>>> index 26008db497..752ef509d0 100644
>>> --- a/qga/commands-posix.c
>>> +++ b/qga/commands-posix.c
>>> @@ -1554,8 +1554,7 @@ static GuestFilesystemInfo
>>> *build_guest_fsinfo(strua5a0239ce5ct FsMount *mount,
>>> Error **errp)
>>>  {
>>>  GuestFilesystemInfo *fs = g_malloc0(sizeof(*fs));
>>> -struct statvfs buf;
>>> -unsigned long used, nonroot_total, fr_size;
>>> +struct statvfs st;
>>>  char *devpath = g_strdup_printf("/sys/dev/block/%u:%u",
>>>  mount->devmajor, mount->devminor);
>>>
>>> @@ -1563,15 +1562,14 @@ static GuestFilesystemInfo
>>> *build_guest_fsinfo(struct FsMount *mount,
>>>  fs->type = g_strdup(mount->devtype);
>>>  build_guest_fsinfo_for_device(devpath, fs, errp);
>>>
>>> -if (statvfs(fs->mountpoint, ) == 0) {
>>> -fr_size = buf.f_frsize;
>>> -used = buf.f_blocks - buf.f_bfree;
>>> -nonroot_total = used + buf.f_bavail;
>>> -fs->used_bytes = used * fr_size;
>>> -fs->total_bytes = nonroot_total * fr_size;
>>> +if (statvfs(fs->mountpoint, ) == 0) {
>>> +fs->total_bytes = st.f_blocks * st.f_frsize;
>>> +fs->free_bytes = st.f_bfree * st.f_frsize;
>>> +fs->avail_bytes = st.f_bavail * st.f_frsize;
>>>
>>>  fs->has_total_bytes = true;
>>> -fs->has_used_bytes = true;
>>> +fs->has_free_bytes = true;
>>> +fs->has_avail_bytes = true;
>>>  }
>>>
>>>  g_free(devpath);
>>> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
>>> index b8efe31897..1cce3c1df5 100644
>>> --- a/qga/qapi-schema.json
>>> +++ b/qga/qapi-schema.json
>>> @@ -1030,9 +1030,12 @@
>>>  #
>>>  # @type: file system type string
>>>  #
>>> -# @used-bytes: file system used bytes (since 3.0)
>>> +# @total-bytes: total file system size in bytes (since 8.3)
>>>  #
>>> -# @total-bytes: non-root file system total bytes (since 3.0)
>>> +# @free-bytes: amount of free space in file system in bytes (since 8.3)
>>>
>>>
>>> I don't agree with this as it breaks backward compatibility. If we want
>>> to get
>>> these changes we should release a new version with both old and new fields
>>> and mark old as deprecated to get a time for everyone who uses this
>>> API updates its solutions.
>>>
>>> A similar thing was with replacing the 'blacklist' command line.
>>> https://gitlab.com/qemu-project/qemu/-/commit/582a098e

Re: [PATCH 1/7] qga/commands-posix: return fsinfo values directly as reported by statvfs

2024-02-27 Thread Andrey Drobyshev



On 2/26/24 20:50, Konstantin Kostiuk wrote:
> 
> Best Regards,
> Konstantin Kostiuk.
> 
> 
> On Mon, Feb 26, 2024 at 7:02 PM Andrey Drobyshev
> mailto:andrey.drobys...@virtuozzo.com>>
> wrote:
> 
> Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to
> GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo:
> used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail).
> These calculations might be obscure for the end user and require one to
> actually get into QGA source to understand how they're obtained. Let's
> just report the values f_blocks, f_bfree, f_bavail (in bytes) from
> statvfs() as they are, letting the user decide how to process them
> further.
> 
> Originally-by: Yuri Pudgorodskiy  <mailto:y...@virtuozzo.com>>
> Signed-off-by: Andrey Drobyshev  <mailto:andrey.drobys...@virtuozzo.com>>
> ---
>  qga/commands-posix.c | 16 +++-
>  qga/qapi-schema.json | 11 +++
>  2 files changed, 14 insertions(+), 13 deletions(-)
> 
> diff --git a/qga/commands-posix.c b/qga/commands-posix.c
> index 26008db497..752ef509d0 100644
> --- a/qga/commands-posix.c
> +++ b/qga/commands-posix.c
> @@ -1554,8 +1554,7 @@ static GuestFilesystemInfo
> *build_guest_fsinfo(strua5a0239ce5ct FsMount *mount,
>                                                 Error **errp)
>  {
>      GuestFilesystemInfo *fs = g_malloc0(sizeof(*fs));
> -    struct statvfs buf;
> -    unsigned long used, nonroot_total, fr_size;
> +    struct statvfs st;
>      char *devpath = g_strdup_printf("/sys/dev/block/%u:%u",
>                                      mount->devmajor, mount->devminor);
> 
> @@ -1563,15 +1562,14 @@ static GuestFilesystemInfo
> *build_guest_fsinfo(struct FsMount *mount,
>      fs->type = g_strdup(mount->devtype);
>      build_guest_fsinfo_for_device(devpath, fs, errp);
> 
> -    if (statvfs(fs->mountpoint, ) == 0) {
> -        fr_size = buf.f_frsize;
> -        used = buf.f_blocks - buf.f_bfree;
> -        nonroot_total = used + buf.f_bavail;
> -        fs->used_bytes = used * fr_size;
> -        fs->total_bytes = nonroot_total * fr_size;
> +    if (statvfs(fs->mountpoint, ) == 0) {
> +        fs->total_bytes = st.f_blocks * st.f_frsize;
> +        fs->free_bytes = st.f_bfree * st.f_frsize;
> +        fs->avail_bytes = st.f_bavail * st.f_frsize;
> 
>          fs->has_total_bytes = true;
> -        fs->has_used_bytes = true;
> +        fs->has_free_bytes = true;
> +        fs->has_avail_bytes = true;
>      }
> 
>      g_free(devpath);
> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
> index b8efe31897..1cce3c1df5 100644
> --- a/qga/qapi-schema.json
> +++ b/qga/qapi-schema.json
> @@ -1030,9 +1030,12 @@
>  #
>  # @type: file system type string
>  #
> -# @used-bytes: file system used bytes (since 3.0)
> +# @total-bytes: total file system size in bytes (since 8.3)
>  #
> -# @total-bytes: non-root file system total bytes (since 3.0)
> +# @free-bytes: amount of free space in file system in bytes (since 8.3)
> 
> 
> I don't agree with this as it breaks backward compatibility. If we want
> to get
> these changes we should release a new version with both old and new fields
> and mark old as deprecated to get a time for everyone who uses this
> API updates its solutions.
> 
> A similar thing was with replacing the 'blacklist' command line.
> https://gitlab.com/qemu-project/qemu/-/commit/582a098e6ca00dd42f317dad8affd13e5a20bc42
>  
> <https://gitlab.com/qemu-project/qemu/-/commit/582a098e6ca00dd42f317dad8affd13e5a20bc42>
> Currently, we support both 'blacklist' and 'block-rpcs' command line options
> but the first one wrote a warning.
>

I agree that marking the old values as deprecated does make sense.
Although my original intent with this patch is to make more sense of the
existing names (e.g. total-bytes to indicate true fs size instead of
just non-root fs).  If so, we'd eventually have to replace the original
total-bytes value with the one having new semantics.  Or we could rename
the existing value to smth like "total-bytes-nonroot".  But either way
breaks backward compatibility after all.  How would you suggest to
resolve it?

Andrey

> @Marc-André Lureau <mailto:marcandre.lur...@redhat.com> @Philippe
> Mathieu-Daudé <mailto:phi...@linaro.org> 
> What do you think about this?
>  
> [...]




[PATCH 7/7] qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper

2024-02-26 Thread Andrey Drobyshev
There's no need to check for the existence of the "chpasswd", "pw"
executables, as the exec() call will do that for us.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 96 ++--
 1 file changed, 13 insertions(+), 83 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 599f59a323..7f39636417 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -2140,14 +2140,8 @@ void qmp_guest_set_user_password(const char *username,
  Error **errp)
 {
 Error *local_err = NULL;
-char *passwd_path = NULL;
-pid_t pid;
-int status;
-int datafd[2] = { -1, -1 };
-char *rawpasswddata = NULL;
+g_autofree char *rawpasswddata = NULL;
 size_t rawpasswdlen;
-char *chpasswddata = NULL;
-size_t chpasswdlen;
 
 rawpasswddata = (char *)qbase64_decode(password, -1, , errp);
 if (!rawpasswddata) {
@@ -2158,95 +2152,31 @@ void qmp_guest_set_user_password(const char *username,
 
 if (strchr(rawpasswddata, '\n')) {
 error_setg(errp, "forbidden characters in raw password");
-goto out;
+return;
 }
 
 if (strchr(username, '\n') ||
 strchr(username, ':')) {
 error_setg(errp, "forbidden characters in username");
-goto out;
+return;
 }
 
 #ifdef __FreeBSD__
-chpasswddata = g_strdup(rawpasswddata);
-passwd_path = g_find_program_in_path("pw");
+g_autofree char *chpasswdata = g_strdup(rawpasswddata);
+const char *crypt_flag = (crypted) ? "-H" : "-h";
+const char *argv[] = {"pw", "usermod", "-n", username,
+  crypt_flag, "0", NULL};
 #else
-chpasswddata = g_strdup_printf("%s:%s\n", username, rawpasswddata);
-passwd_path = g_find_program_in_path("chpasswd");
+g_autofree char *chpasswddata = g_strdup_printf("%s:%s\n", username,
+rawpasswddata);
+const char *crypt_flag = (crypted) ? "-e" : NULL;
+const char *argv[] = {"chpasswd", crypt_flag, NULL};
 #endif
 
-chpasswdlen = strlen(chpasswddata);
-
-if (!passwd_path) {
-error_setg(errp, "cannot find 'passwd' program in PATH");
-goto out;
-}
-
-if (!g_unix_open_pipe(datafd, FD_CLOEXEC, NULL)) {
-error_setg(errp, "cannot create pipe FDs");
-goto out;
-}
-
-pid = fork();
-if (pid == 0) {
-close(datafd[1]);
-/* child */
-setsid();
-dup2(datafd[0], 0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-#ifdef __FreeBSD__
-const char *h_arg;
-h_arg = (crypted) ? "-H" : "-h";
-execl(passwd_path, "pw", "usermod", "-n", username, h_arg, "0", NULL);
-#else
-if (crypted) {
-execl(passwd_path, "chpasswd", "-e", NULL);
-} else {
-execl(passwd_path, "chpasswd", NULL);
-}
-#endif
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-goto out;
-}
-close(datafd[0]);
-datafd[0] = -1;
-
-if (qemu_write_full(datafd[1], chpasswddata, chpasswdlen) != chpasswdlen) {
-error_setg_errno(errp, errno, "cannot write new account password");
-goto out;
-}
-close(datafd[1]);
-datafd[1] = -1;
-
-ga_wait_child(pid, , _err);
+ga_run_command(argv, chpasswddata, "set user password", _err);
 if (local_err) {
 error_propagate(errp, local_err);
-goto out;
-}
-
-if (!WIFEXITED(status)) {
-error_setg(errp, "child process has terminated abnormally");
-goto out;
-}
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "child process has failed to set user password");
-goto out;
-}
-
-out:
-g_free(chpasswddata);
-g_free(rawpasswddata);
-g_free(passwd_path);
-if (datafd[0] != -1) {
-close(datafd[0]);
-}
-if (datafd[1] != -1) {
-close(datafd[1]);
+return;
 }
 }
 #else /* __linux__ || __FreeBSD__ */
-- 
2.39.3




[PATCH 1/7] qga/commands-posix: return fsinfo values directly as reported by statvfs

2024-02-26 Thread Andrey Drobyshev
Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to
GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo:
used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail).
These calculations might be obscure for the end user and require one to
actually get into QGA source to understand how they're obtained. Let's
just report the values f_blocks, f_bfree, f_bavail (in bytes) from
statvfs() as they are, letting the user decide how to process them further.

Originally-by: Yuri Pudgorodskiy 
Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 16 +++-
 qga/qapi-schema.json | 11 +++
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 26008db497..752ef509d0 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -1554,8 +1554,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct 
FsMount *mount,
Error **errp)
 {
 GuestFilesystemInfo *fs = g_malloc0(sizeof(*fs));
-struct statvfs buf;
-unsigned long used, nonroot_total, fr_size;
+struct statvfs st;
 char *devpath = g_strdup_printf("/sys/dev/block/%u:%u",
 mount->devmajor, mount->devminor);
 
@@ -1563,15 +1562,14 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct 
FsMount *mount,
 fs->type = g_strdup(mount->devtype);
 build_guest_fsinfo_for_device(devpath, fs, errp);
 
-if (statvfs(fs->mountpoint, ) == 0) {
-fr_size = buf.f_frsize;
-used = buf.f_blocks - buf.f_bfree;
-nonroot_total = used + buf.f_bavail;
-fs->used_bytes = used * fr_size;
-fs->total_bytes = nonroot_total * fr_size;
+if (statvfs(fs->mountpoint, ) == 0) {
+fs->total_bytes = st.f_blocks * st.f_frsize;
+fs->free_bytes = st.f_bfree * st.f_frsize;
+fs->avail_bytes = st.f_bavail * st.f_frsize;
 
 fs->has_total_bytes = true;
-fs->has_used_bytes = true;
+fs->has_free_bytes = true;
+fs->has_avail_bytes = true;
 }
 
 g_free(devpath);
diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
index b8efe31897..1cce3c1df5 100644
--- a/qga/qapi-schema.json
+++ b/qga/qapi-schema.json
@@ -1030,9 +1030,12 @@
 #
 # @type: file system type string
 #
-# @used-bytes: file system used bytes (since 3.0)
+# @total-bytes: total file system size in bytes (since 8.3)
 #
-# @total-bytes: non-root file system total bytes (since 3.0)
+# @free-bytes: amount of free space in file system in bytes (since 8.3)
+#
+# @avail-bytes: amount of free space in file system for unprivileged
+# users in bytes (since 8.3)
 #
 # @disk: an array of disk hardware information that the volume lies
 # on, which may be empty if the disk type is not supported
@@ -1041,8 +1044,8 @@
 ##
 { 'struct': 'GuestFilesystemInfo',
   'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str',
-   '*used-bytes': 'uint64', '*total-bytes': 'uint64',
-   'disk': ['GuestDiskAddress']} }
+   '*total-bytes': 'uint64', '*free-bytes': 'uint64',
+   '*avail-bytes': 'uint64', 'disk': ['GuestDiskAddress']} }
 
 ##
 # @guest-get-fsinfo:
-- 
2.39.3




[PATCH 3/7] qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper

2024-02-26 Thread Andrey Drobyshev
Also remove the G_GNUC_UNUSED attribute added in the previous commit from
the helper.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 39 ++-
 1 file changed, 6 insertions(+), 33 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 510b902b1a..4b64716bab 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -103,7 +103,6 @@ static void ga_pipe_read_str(int fd[2], char **str, size_t 
*len)
  * sending string to stdin and taking error message from
  * stdout/err.
  */
-G_GNUC_UNUSED
 static int ga_run_command(const char *argv[], const char *in_str,
   const char *action, Error **errp)
 {
@@ -220,8 +219,6 @@ void qmp_guest_shutdown(const char *mode, Error **errp)
 {
 const char *shutdown_flag;
 Error *local_err = NULL;
-pid_t pid;
-int status;
 
 #ifdef CONFIG_SOLARIS
 const char *powerdown_flag = "-i5";
@@ -250,46 +247,22 @@ void qmp_guest_shutdown(const char *mode, Error **errp)
 return;
 }
 
-pid = fork();
-if (pid == 0) {
-/* child, start the shutdown */
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
+const char *argv[] = {"/sbin/shutdown",
 #ifdef CONFIG_SOLARIS
-execl("/sbin/shutdown", "shutdown", shutdown_flag, "-g0", "-y",
-  "hypervisor initiated shutdown", (char *)NULL);
+  shutdown_flag, "-g0", "-y",
 #elif defined(CONFIG_BSD)
-execl("/sbin/shutdown", "shutdown", shutdown_flag, "+0",
-   "hypervisor initiated shutdown", (char *)NULL);
+  shutdown_flag, "+0",
 #else
-execl("/sbin/shutdown", "shutdown", "-h", shutdown_flag, "+0",
-   "hypervisor initiated shutdown", (char *)NULL);
+  "-h", shutdown_flag, "+0",
 #endif
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
+  "hypervisor initiated shutdown", (char *) NULL};
 
-ga_wait_child(pid, , _err);
+ga_run_command(argv, NULL, "shutdown", _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
 
-if (!WIFEXITED(status)) {
-error_setg(errp, "child process has terminated abnormally");
-return;
-}
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "child process has failed to shutdown");
-return;
-}
-
 /* succeeded */
 }
 
-- 
2.39.3




[PATCH 2/7] qga: introduce ga_run_command() helper for guest cmd execution

2024-02-26 Thread Andrey Drobyshev
When executing guest commands in *nix environment, we repeat the same
fork/exec pattern multiple times.  Let's just separate it into a single
helper which would also be able to feed input data into the launched
process' stdin.  This way we can avoid code duplication.

To keep the history more bisectable, let's replace qmp commands
implementations one by one.  Also add G_GNUC_UNUSED attribute to the
helper and remove it in the next commit.

Originally-by: Yuri Pudgorodskiy 
Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 140 +++
 1 file changed, 140 insertions(+)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 752ef509d0..510b902b1a 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -76,6 +76,146 @@ static void ga_wait_child(pid_t pid, int *status, Error 
**errp)
 g_assert(rpid == pid);
 }
 
+static void ga_pipe_read_str(int fd[2], char **str, size_t *len)
+{
+ssize_t n;
+char buf[1024];
+close(fd[1]);
+fd[1] = -1;
+while ((n = read(fd[0], buf, sizeof(buf))) != 0) {
+if (n < 0) {
+if (errno == EINTR) {
+continue;
+} else {
+break;
+}
+}
+*str = g_realloc(*str, *len + n);
+memcpy(*str + *len, buf, n);
+*len += n;
+}
+close(fd[0]);
+fd[0] = -1;
+}
+
+/*
+ * Helper to run command with input/output redirection,
+ * sending string to stdin and taking error message from
+ * stdout/err.
+ */
+G_GNUC_UNUSED
+static int ga_run_command(const char *argv[], const char *in_str,
+  const char *action, Error **errp)
+{
+pid_t pid;
+int status;
+int retcode = -1;
+int infd[2] = { -1, -1 };
+int outfd[2] = { -1, -1 };
+char *str = NULL;
+size_t len = 0;
+
+if ((in_str && !g_unix_open_pipe(infd, FD_CLOEXEC, NULL)) ||
+!g_unix_open_pipe(outfd, FD_CLOEXEC, NULL)) {
+error_setg(errp, "cannot create pipe FDs");
+goto out;
+}
+
+pid = fork();
+if (pid == 0) {
+char *cherr = NULL;
+
+setsid();
+
+if (in_str) {
+/* Redirect stdin to infd. */
+close(infd[1]);
+dup2(infd[0], 0);
+close(infd[0]);
+} else {
+reopen_fd_to_null(0);
+}
+
+/* Redirect stdout/stderr to outfd. */
+close(outfd[0]);
+dup2(outfd[1], 1);
+dup2(outfd[1], 2);
+close(outfd[1]);
+
+execvp(argv[0], (char *const *)argv);
+
+/* Write the cause of failed exec to pipe for the parent to read it. */
+cherr = g_strdup_printf("failed to exec '%s'", argv[0]);
+perror(cherr);
+g_free(cherr);
+_exit(EXIT_FAILURE);
+} else if (pid < 0) {
+error_setg_errno(errp, errno, "failed to create child process");
+goto out;
+}
+
+if (in_str) {
+close(infd[0]);
+infd[0] = -1;
+if (qemu_write_full(infd[1], in_str, strlen(in_str)) !=
+strlen(in_str)) {
+error_setg_errno(errp, errno, "%s: cannot write to stdin pipe",
+ action);
+goto out;
+}
+close(infd[1]);
+infd[1] = -1;
+}
+
+ga_pipe_read_str(outfd, , );
+
+ga_wait_child(pid, , errp);
+if (*errp) {
+goto out;
+}
+
+if (!WIFEXITED(status)) {
+if (len) {
+error_setg(errp, "child process has terminated abnormally: %s",
+   str);
+} else {
+error_setg(errp, "child process has terminated abnormally");
+}
+goto out;
+}
+
+retcode = WEXITSTATUS(status);
+
+if (WEXITSTATUS(status)) {
+if (len) {
+error_setg(errp, "child process has failed to %s: %s",
+   action, str);
+} else {
+error_setg(errp, "child process has failed to %s: exit status %d",
+   action, WEXITSTATUS(status));
+}
+goto out;
+}
+
+out:
+g_free(str);
+
+if (infd[0] != -1) {
+close(infd[0]);
+}
+if (infd[1] != -1) {
+close(infd[1]);
+}
+if (outfd[0] != -1) {
+close(outfd[0]);
+}
+if (outfd[1] != -1) {
+close(outfd[1]);
+}
+
+return retcode;
+}
+
 void qmp_guest_shutdown(const char *mode, Error **errp)
 {
 const char *shutdown_flag;
-- 
2.39.3




[PATCH 6/7] qga/commands-posix: use ga_run_command helper when suspending via sysfs

2024-02-26 Thread Andrey Drobyshev
We replace the direct call to open() with a "sh -c 'echo ...'" call, so
that it becomes an executable command.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 36 
 1 file changed, 4 insertions(+), 32 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 6baaabc29e..599f59a323 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -1917,49 +1917,21 @@ static void linux_sys_state_suspend(SuspendMode mode, 
Error **errp)
 Error *local_err = NULL;
 const char *sysfile_strs[3] = {"disk", "mem", NULL};
 const char *sysfile_str = sysfile_strs[mode];
-pid_t pid;
-int status;
 
 if (!sysfile_str) {
 error_setg(errp, "unknown guest suspend mode");
 return;
 }
 
-pid = fork();
-if (!pid) {
-/* child */
-int fd;
-
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-fd = open(LINUX_SYS_STATE_FILE, O_WRONLY);
-if (fd < 0) {
-_exit(EXIT_FAILURE);
-}
-
-if (write(fd, sysfile_str, strlen(sysfile_str)) < 0) {
-_exit(EXIT_FAILURE);
-}
-
-_exit(EXIT_SUCCESS);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
+g_autofree char *echo_cmd = g_strdup_printf(
+"echo %s > " LINUX_SYS_STATE_FILE, sysfile_str);
+const char *argv[] = {"sh", "-c", echo_cmd, NULL};
 
-ga_wait_child(pid, , _err);
+ga_run_command(argv, NULL, "suspend", _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "child process has failed to suspend");
-}
-
 }
 
 static void guest_suspend(SuspendMode mode, Error **errp)
-- 
2.39.3




[PATCH 0/7] qga/commands-posix: replace code duplicating commands with a helper

2024-02-26 Thread Andrey Drobyshev
This series simply replaces repeating fork()/exec() pattern with a
separate helper to avoid code duplication.  It's easier to setup and use
than g_spawn_async_with_pipes() (which we'd need since some commands require
input).  While here, also make qmp_guest_get_fsinfo return more
straightforward values.

Andrey Drobyshev (7):
  qga/commands-posix: return fsinfo values directly as reported by
statvfs
  qga: introduce ga_run_command() helper for guest cmd execution
  qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper
  qga/commands-posix: qmp_guest_set_time: use ga_run_command helper
  qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper
  qga/commands-posix: use ga_run_command helper when suspending via
sysfs
  qga/commands-posix: qmp_guest_set_user_password: use ga_run_command
helper

 qga/commands-posix.c | 402 +++
 qga/qapi-schema.json |  11 +-
 2 files changed, 181 insertions(+), 232 deletions(-)

-- 
2.39.3




[PATCH 5/7] qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper

2024-02-26 Thread Andrey Drobyshev
There's no need to check for the existence of the hook executable, as the
exec() call will do that for us.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 35 +++
 1 file changed, 3 insertions(+), 32 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index b704006c98..6baaabc29e 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -726,8 +726,6 @@ static const char *fsfreeze_hook_arg_string[] = {
 
 static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp)
 {
-int status;
-pid_t pid;
 const char *hook;
 const char *arg_str = fsfreeze_hook_arg_string[arg];
 Error *local_err = NULL;
@@ -736,42 +734,15 @@ static void execute_fsfreeze_hook(FsfreezeHookArg arg, 
Error **errp)
 if (!hook) {
 return;
 }
-if (access(hook, X_OK) != 0) {
-error_setg_errno(errp, errno, "can't access fsfreeze hook '%s'", hook);
-return;
-}
 
-slog("executing fsfreeze hook with arg '%s'", arg_str);
-pid = fork();
-if (pid == 0) {
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-execl(hook, hook, arg_str, NULL);
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
+const char *argv[] = {hook, arg_str, NULL};
 
-ga_wait_child(pid, , _err);
+slog("executing fsfreeze hook with arg '%s'", arg_str);
+ga_run_command(argv, NULL, "execute fsfreeze hook", _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
-
-if (!WIFEXITED(status)) {
-error_setg(errp, "fsfreeze hook has terminated abnormally");
-return;
-}
-
-status = WEXITSTATUS(status);
-if (status) {
-error_setg(errp, "fsfreeze hook has failed with status %d", status);
-return;
-}
 }
 
 /*
-- 
2.39.3




[PATCH 4/7] qga/commands-posix: qmp_guest_set_time: use ga_run_command helper

2024-02-26 Thread Andrey Drobyshev
There's no need to check for the existence of "/sbin/hwclock", the
exec() call will do that for us.

Signed-off-by: Andrey Drobyshev 
---
 qga/commands-posix.c | 43 +++
 1 file changed, 3 insertions(+), 40 deletions(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 4b64716bab..b704006c98 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -269,21 +269,9 @@ void qmp_guest_shutdown(const char *mode, Error **errp)
 void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp)
 {
 int ret;
-int status;
-pid_t pid;
 Error *local_err = NULL;
 struct timeval tv;
-static const char hwclock_path[] = "/sbin/hwclock";
-static int hwclock_available = -1;
-
-if (hwclock_available < 0) {
-hwclock_available = (access(hwclock_path, X_OK) == 0);
-}
-
-if (!hwclock_available) {
-error_setg(errp, QERR_UNSUPPORTED);
-return;
-}
+const char *argv[] = {"/sbin/hwclock", has_time ? "-w" : "-s", NULL};
 
 /* If user has passed a time, validate and set it. */
 if (has_time) {
@@ -314,37 +302,12 @@ void qmp_guest_set_time(bool has_time, int64_t time_ns, 
Error **errp)
  * just need to synchronize the hardware clock. However, if no time was
  * passed, user is requesting the opposite: set the system time from the
  * hardware clock (RTC). */
-pid = fork();
-if (pid == 0) {
-setsid();
-reopen_fd_to_null(0);
-reopen_fd_to_null(1);
-reopen_fd_to_null(2);
-
-/* Use '/sbin/hwclock -w' to set RTC from the system time,
- * or '/sbin/hwclock -s' to set the system time from RTC. */
-execl(hwclock_path, "hwclock", has_time ? "-w" : "-s", NULL);
-_exit(EXIT_FAILURE);
-} else if (pid < 0) {
-error_setg_errno(errp, errno, "failed to create child process");
-return;
-}
-
-ga_wait_child(pid, , _err);
+ga_run_command(argv, NULL, "set hardware clock to system time",
+   _err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
-
-if (!WIFEXITED(status)) {
-error_setg(errp, "child process has terminated abnormally");
-return;
-}
-
-if (WEXITSTATUS(status)) {
-error_setg(errp, "hwclock failed to set hardware clock to system 
time");
-return;
-}
 }
 
 typedef enum {
-- 
2.39.3




Re: [PATCH] kvm: emit GUEST_PANICKED event in case of abnormal KVM exit

2024-02-08 Thread Andrey Drobyshev
On 1/25/24 18:36, Andrey Drobyshev wrote:
> On 1/11/24 15:15, Andrey Drobyshev wrote:
>> On 11/1/23 18:13, Denis V. Lunev wrote:
>>> On 11/1/23 16:23, Andrey Drobyshev wrote:
>>>> Currently we emit GUEST_PANICKED event in case kvm_vcpu_ioctl() returns
>>>> KVM_EXIT_SYSTEM_EVENT with the event type KVM_SYSTEM_EVENT_CRASH.  Let's
>>>> extend this scenario and emit GUEST_PANICKED in case of an abnormal KVM
>>>> exit.  That's a natural thing to do since in this case guest is no
>>>> longer operational anyway.
>>>>
>>>> Signed-off-by: Andrey Drobyshev 
>>>> Acked-by: Denis V. Lunev 
>>>> ---
>>>>   accel/kvm/kvm-all.c | 19 +++
>>>>   1 file changed, 15 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>>>> index e39a810a4e..d74b3f0b0e 100644
>>>> --- a/accel/kvm/kvm-all.c
>>>> +++ b/accel/kvm/kvm-all.c
>>>> @@ -2816,6 +2816,14 @@ static void kvm_eat_signals(CPUState *cpu)
>>>>   } while (sigismember(, SIG_IPI));
>>>>   }
>>>>   +static void kvm_emit_guest_crash(CPUState *cpu)
>>>> +{
>>>> +    kvm_cpu_synchronize_state(cpu);
>>>> +    qemu_mutex_lock_iothread();
>>>> +    qemu_system_guest_panicked(cpu_get_crash_info(cpu));
>>>> +    qemu_mutex_unlock_iothread();
>>>> +}
>>>> +
>>>>   int kvm_cpu_exec(CPUState *cpu)
>>>>   {
>>>>   struct kvm_run *run = cpu->kvm_run;
>>>> @@ -2969,21 +2977,24 @@ int kvm_cpu_exec(CPUState *cpu)
>>>>   ret = EXCP_INTERRUPT;
>>>>   break;
>>>>   case KVM_SYSTEM_EVENT_CRASH:
>>>> -    kvm_cpu_synchronize_state(cpu);
>>>> -    qemu_mutex_lock_iothread();
>>>> -    qemu_system_guest_panicked(cpu_get_crash_info(cpu));
>>>> -    qemu_mutex_unlock_iothread();
>>>> +    kvm_emit_guest_crash(cpu);
>>>>   ret = 0;
>>>>   break;
>>>>   default:
>>>>   DPRINTF("kvm_arch_handle_exit\n");
>>>>   ret = kvm_arch_handle_exit(cpu, run);
>>>> +    if (ret < 0) {
>>>> +    kvm_emit_guest_crash(cpu);
>>>> +    }
>>>>   break;
>>>>   }
>>>>   break;
>>>>   default:
>>>>   DPRINTF("kvm_arch_handle_exit\n");
>>>>   ret = kvm_arch_handle_exit(cpu, run);
>>>> +    if (ret < 0) {
>>>> +    kvm_emit_guest_crash(cpu);
>>>> +    }
>>>>   break;
>>>>   }
>>>>   } while (ret == 0);
>>> This allows to gracefully handle this problem in production
>>> and reset the guest using on_crash action in libvirt.
>>
>> Ping
> 
> Ping

Yet another ping



Re: [PATCH] block: allocate aligned write buffer for 'truncate -m full'

2024-02-08 Thread Andrey Drobyshev
On 1/25/24 18:46, Vladimir Sementsov-Ogievskiy wrote:
> On 11.12.23 13:55, Andrey Drobyshev wrote:
>> In case we're truncating an image opened with O_DIRECT, we might get
>> -EINVAL on write with unaligned buffer.  In particular, when running
>> iotests/298 with '-nocache' we get:
>>
>> qemu-io: Failed to resize underlying file: Could not write zeros for
>> preallocation: Invalid argument
>>
>> Let's just allocate the buffer using qemu_blockalign0() instead.
>>
>> Signed-off-by: Andrey Drobyshev 
> 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> 
> I also suggest to use QEMU_AUTO_VFREE (keep my r-b if you do).
> 
>> ---
>>   block/file-posix.c | 4 ++--
>>   1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/file-posix.c b/block/file-posix.c
>> index b862406c71..cee8de510b 100644
>> --- a/block/file-posix.c
>> +++ b/block/file-posix.c
>> @@ -2354,7 +2354,7 @@ static int handle_aiocb_truncate(void *opaque)
>>   goto out;
>>   }
>>   -    buf = g_malloc0(65536);
>> +    buf = qemu_blockalign0(aiocb->bs, 65536);
>>     seek_result = lseek(fd, current_length, SEEK_SET);
>>   if (seek_result < 0) {
>> @@ -2413,7 +2413,7 @@ out:
>>   }
>>   }
>>   -    g_free(buf);
>> +    qemu_vfree(buf);
>>   return result;
>>   }
>>   
> 

Yet another ping, just checking if any of the block maintainers is
interested



Re: [PATCH] iotests: don't run tests requiring cached writes in '-nocache' mode

2024-01-26 Thread Andrey Drobyshev
On 1/26/24 12:24, Kevin Wolf wrote:
> Am 11.12.2023 um 14:32 hat Andrey Drobyshev geschrieben:
>> There're tests whose logic implies running without O_DIRECT set,
>> otherwise they fail when running iotests in '-nocache' mode.  For these
>> tests let's add _require_no_o_direct() helper which can be put in the
>> preabmle and which makes sure '-nocache' isn't set.  Use it to skip
>> running the following tests:
>>
>>   * 271: creates files with unaligned sizes, thus producing multiple
>> errors like:
>>
>> qemu-io: can't open device /path/to/t.qcow2.raw: Cannot get 'write'
>> permission without 'resize': Image size is not a multiple of request 
>> alignment
>>
>>   * 308, file-io-error: use fuse exports.  Though fuse does have
>> 'direct-io' mode (see https://docs.kernel.org/filesystems/fuse-io.html)
>> we aren't using it yet, thus getting errors like:
>>
>> qemu-io: can't open device /path/to/t.qcow2.fuse: Could not open
>> '/path/to/t.qcow2.fuse': filesystem does not support O_DIRECT
>>
>> Signed-off-by: Andrey Drobyshev 
> 
> How are you running qemu-iotests to make these tests fail? I tried to
> reproduce, but they just pass for me:
> 
> $ tests/qemu-iotests/check -qcow2 -nocache 271 308 file-io-error
> [...]
> 271 pass   [11:20:50] [11:21:11]   21.1s  (last: 20.4s) 
> 308 pass   [11:21:11] [11:21:14]   3.3s   (last: 3.3s)  
> file-io-error   pass   [11:21:14] [11:21:14]   0.3s   (last: 0.3s)  
> Passed all 3 iotests
> 
> $ tests/qemu-iotests/check -raw -nocache 271 308 file-io-error
> 271 not run[11:21:20] [11:21:21]   ...  not 
> suitable for this image format: raw
> 308 pass   [11:21:21] [11:21:24]   3.8s   (last: 2.8s)  
> file-io-error   pass   [11:21:24] [11:21:25]   0.3s   (last: 0.3s)  
> Not run: 271
> Passed all 2 iotests
> 
> Kevin
> 

As for the test 271, I imagine this might be caused by different request
alignment.  The failure occurs in block.c, bdrv_node_refresh_perm().  If
I print the alignment out explicitly, I get:

qemu-io: can't open device /path/to/t.qcow2.raw: Cannot get 'write'
permission without 'resize': Image size is not a multiple of request
alignment: 4096

For the record, I'm running tests on ext4.

I'm not sure about the fuse tests though.  Could it also have smth to do
with the underlying fs?

Andrey



Re: [PATCH] block: allocate aligned write buffer for 'truncate -m full'

2024-01-25 Thread Andrey Drobyshev
On 1/11/24 14:53, Andrey Drobyshev wrote:
> On 12/11/23 13:27, Denis V. Lunev wrote:
>> On 12/11/23 11:55, Andrey Drobyshev wrote:
>>> In case we're truncating an image opened with O_DIRECT, we might get
>>> -EINVAL on write with unaligned buffer.  In particular, when running
>>> iotests/298 with '-nocache' we get:
>>>
>>> qemu-io: Failed to resize underlying file: Could not write zeros for
>>> preallocation: Invalid argument
>>>
>>> Let's just allocate the buffer using qemu_blockalign0() instead.
>>>
>>> Signed-off-by: Andrey Drobyshev 
>>> ---
>>>   block/file-posix.c | 4 ++--
>>>   1 file changed, 2 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/block/file-posix.c b/block/file-posix.c
>>> index b862406c71..cee8de510b 100644
>>> --- a/block/file-posix.c
>>> +++ b/block/file-posix.c
>>> @@ -2354,7 +2354,7 @@ static int handle_aiocb_truncate(void *opaque)
>>>   goto out;
>>>   }
>>>   -    buf = g_malloc0(65536);
>>> +    buf = qemu_blockalign0(aiocb->bs, 65536);
>>>     seek_result = lseek(fd, current_length, SEEK_SET);
>>>   if (seek_result < 0) {
>>> @@ -2413,7 +2413,7 @@ out:
>>>   }
>>>   }
>>>   -    g_free(buf);
>>> +    qemu_vfree(buf);
>>>   return result;
>>>   }
>>>   
>> Reviewed-by: Denis V. Lunev 
> 
> Ping

Ping



Re: [PATCH] kvm: emit GUEST_PANICKED event in case of abnormal KVM exit

2024-01-25 Thread Andrey Drobyshev
On 1/11/24 15:15, Andrey Drobyshev wrote:
> On 11/1/23 18:13, Denis V. Lunev wrote:
>> On 11/1/23 16:23, Andrey Drobyshev wrote:
>>> Currently we emit GUEST_PANICKED event in case kvm_vcpu_ioctl() returns
>>> KVM_EXIT_SYSTEM_EVENT with the event type KVM_SYSTEM_EVENT_CRASH.  Let's
>>> extend this scenario and emit GUEST_PANICKED in case of an abnormal KVM
>>> exit.  That's a natural thing to do since in this case guest is no
>>> longer operational anyway.
>>>
>>> Signed-off-by: Andrey Drobyshev 
>>> Acked-by: Denis V. Lunev 
>>> ---
>>>   accel/kvm/kvm-all.c | 19 +++
>>>   1 file changed, 15 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>>> index e39a810a4e..d74b3f0b0e 100644
>>> --- a/accel/kvm/kvm-all.c
>>> +++ b/accel/kvm/kvm-all.c
>>> @@ -2816,6 +2816,14 @@ static void kvm_eat_signals(CPUState *cpu)
>>>   } while (sigismember(, SIG_IPI));
>>>   }
>>>   +static void kvm_emit_guest_crash(CPUState *cpu)
>>> +{
>>> +    kvm_cpu_synchronize_state(cpu);
>>> +    qemu_mutex_lock_iothread();
>>> +    qemu_system_guest_panicked(cpu_get_crash_info(cpu));
>>> +    qemu_mutex_unlock_iothread();
>>> +}
>>> +
>>>   int kvm_cpu_exec(CPUState *cpu)
>>>   {
>>>   struct kvm_run *run = cpu->kvm_run;
>>> @@ -2969,21 +2977,24 @@ int kvm_cpu_exec(CPUState *cpu)
>>>   ret = EXCP_INTERRUPT;
>>>   break;
>>>   case KVM_SYSTEM_EVENT_CRASH:
>>> -    kvm_cpu_synchronize_state(cpu);
>>> -    qemu_mutex_lock_iothread();
>>> -    qemu_system_guest_panicked(cpu_get_crash_info(cpu));
>>> -    qemu_mutex_unlock_iothread();
>>> +    kvm_emit_guest_crash(cpu);
>>>   ret = 0;
>>>   break;
>>>   default:
>>>   DPRINTF("kvm_arch_handle_exit\n");
>>>   ret = kvm_arch_handle_exit(cpu, run);
>>> +    if (ret < 0) {
>>> +    kvm_emit_guest_crash(cpu);
>>> +    }
>>>   break;
>>>   }
>>>   break;
>>>   default:
>>>   DPRINTF("kvm_arch_handle_exit\n");
>>>   ret = kvm_arch_handle_exit(cpu, run);
>>> +    if (ret < 0) {
>>> +    kvm_emit_guest_crash(cpu);
>>> +    }
>>>   break;
>>>   }
>>>   } while (ret == 0);
>> This allows to gracefully handle this problem in production
>> and reset the guest using on_crash action in libvirt.
> 
> Ping

Ping



Re: [PATCH] iotests: don't run tests requiring cached writes in '-nocache' mode

2024-01-25 Thread Andrey Drobyshev
On 1/11/24 14:53, Andrey Drobyshev wrote:
> On 12/11/23 15:32, Andrey Drobyshev wrote:
>> There're tests whose logic implies running without O_DIRECT set,
>> otherwise they fail when running iotests in '-nocache' mode.  For these
>> tests let's add _require_no_o_direct() helper which can be put in the
>> preabmle and which makes sure '-nocache' isn't set.  Use it to skip
>> running the following tests:
>>
>>   * 271: creates files with unaligned sizes, thus producing multiple
>> errors like:
>>
>> qemu-io: can't open device /path/to/t.qcow2.raw: Cannot get 'write'
>> permission without 'resize': Image size is not a multiple of request 
>> alignment
>>
>>   * 308, file-io-error: use fuse exports.  Though fuse does have
>> 'direct-io' mode (see https://docs.kernel.org/filesystems/fuse-io.html)
>> we aren't using it yet, thus getting errors like:
>>
>> qemu-io: can't open device /path/to/t.qcow2.fuse: Could not open
>> '/path/to/t.qcow2.fuse': filesystem does not support O_DIRECT
>>
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>  tests/qemu-iotests/271 | 1 +
>>  tests/qemu-iotests/308 | 2 ++
>>  tests/qemu-iotests/common.rc   | 7 +++
>>  tests/qemu-iotests/tests/file-io-error | 1 +
>>  4 files changed, 11 insertions(+)
>>
>> diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271
>> index 59a6fafa2f..1424b6954d 100755
>> --- a/tests/qemu-iotests/271
>> +++ b/tests/qemu-iotests/271
>> @@ -44,6 +44,7 @@ _supported_fmt qcow2
>>  _supported_proto file nfs
>>  _supported_os Linux
>>  _unsupported_imgopts extended_l2 compat=0.10 cluster_size data_file 
>> refcount_bits=1[^0-9]
>> +_require_no_o_direct
>>  
>>  l2_offset=$((0x4))
>>  
>> diff --git a/tests/qemu-iotests/308 b/tests/qemu-iotests/308
>> index de12b2b1b9..535455e5b1 100755
>> --- a/tests/qemu-iotests/308
>> +++ b/tests/qemu-iotests/308
>> @@ -52,6 +52,8 @@ _unsupported_fmt vpc
>>  _supported_proto file # We create the FUSE export manually
>>  _supported_os Linux # We need /dev/urandom
>>  
>> +_require_no_o_direct
>> +
>>  # $1: Export ID
>>  # $2: Options (beyond the node-name and ID)
>>  # $3: Expected return value (defaults to 'return')
>> diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
>> index 95c12577dd..f61eae73b4 100644
>> --- a/tests/qemu-iotests/common.rc
>> +++ b/tests/qemu-iotests/common.rc
>> @@ -857,6 +857,13 @@ _check_o_direct()
>>  [[ "$out" != *"O_DIRECT"* ]]
>>  }
>>  
>> +_require_no_o_direct()
>> +{
>> +if [ $CACHEMODE == "none" ] || [ $CACHEMODE == "directsync" ]; then
>> +_notrun "not suitable for cache mode: $CACHEMODE (implies O_DIRECT)"
>> +fi
>> +}
>> +
>>  _require_o_direct()
>>  {
>>  if ! _check_o_direct; then
>> diff --git a/tests/qemu-iotests/tests/file-io-error 
>> b/tests/qemu-iotests/tests/file-io-error
>> index 88ee5f670c..2b8dc7f009 100755
>> --- a/tests/qemu-iotests/tests/file-io-error
>> +++ b/tests/qemu-iotests/tests/file-io-error
>> @@ -40,6 +40,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
>>  # Format-agnostic (we do not use any), but we do test the file protocol
>>  _supported_proto file
>>  _require_drivers blkdebug null-co
>> +_require_no_o_direct
>>  
>>  if [ "$IMGOPTSSYNTAX" = "true" ]; then
>>  # We need `$QEMU_IO -f file` to work; IMGOPTSSYNTAX uses --image-opts,
> 
> Ping

Ping



Re: [PATCH] iotests/277: Use iotests.sock_dir for socket creation

2024-01-25 Thread Andrey Drobyshev
On 1/24/24 19:59, Denis V. Lunev wrote:
> On 1/24/24 18:43, Eric Blake wrote:
>> On Wed, Jan 24, 2024 at 06:22:57PM +0200, Andrey Drobyshev wrote:
>>> If socket path is too long (longer than 108 bytes), socket can't be
>>> opened.  This might lead to failure when test dir path is long enough.
>>> Make sure socket is created in iotests.sock_dir to avoid such a case.
>>>
>>> This commit basically aligns iotests/277 with the rest of iotests.
>>>
>>> Signed-off-by: Andrey Drobyshev 
>>> ---
>>>   tests/qemu-iotests/277 | 3 ++-
>>>   1 file changed, 2 insertions(+), 1 deletion(-)
>> Reviewed-by: Eric Blake 
>>
>>> diff --git a/tests/qemu-iotests/277 b/tests/qemu-iotests/277
>>> index 24833e7eb6..4224202ac2 100755
>>> --- a/tests/qemu-iotests/277
>>> +++ b/tests/qemu-iotests/277
>>> @@ -27,7 +27,8 @@ from iotests import file_path, log
>>>   iotests.script_initialize()
>>>     -nbd_sock, conf_file = file_path('nbd-sock',
>>> 'nbd-fault-injector.conf')
>>> +conf_file = file_path('nbd-fault-injector.conf')
>>> +nbd_sock = file_path('nbd-sock', base_dir=iotests.sock_dir)
>>>       def make_conf_file(event):
>>> -- 
>>> 2.39.3
>>>
> I would say that potentially the same code is present
> in 264, it is :
> disk_a, disk_b, nbd_sock = file_path('disk_a', 'disk_b', 'nbd-sock')
> nbd_uri = 'nbd+unix:///?socket=' + nbd_sock

Thanks, have just sent a separate fix for iotests/264.  Haven't found
any others.

Andrey



[PATCH] iotests/264: Use iotests.sock_dir for socket creation

2024-01-25 Thread Andrey Drobyshev
If socket path is too long (longer than 108 bytes), socket can't be
opened.  This might lead to failure when test dir path is long enough.
Make sure socket is created in iotests.sock_dir to avoid such a case.

This commit basically aligns iotests/264 with the rest of iotests.

Signed-off-by: Andrey Drobyshev 
---
 tests/qemu-iotests/264 | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/264 b/tests/qemu-iotests/264
index c532ccd809..c6ba2754e2 100755
--- a/tests/qemu-iotests/264
+++ b/tests/qemu-iotests/264
@@ -25,7 +25,8 @@ import os
 import iotests
 from iotests import qemu_img_create, file_path, qemu_nbd_popen
 
-disk_a, disk_b, nbd_sock = file_path('disk_a', 'disk_b', 'nbd-sock')
+disk_a, disk_b = file_path('disk_a', 'disk_b')
+nbd_sock = file_path('nbd-sock', base_dir=iotests.sock_dir)
 nbd_uri = 'nbd+unix:///?socket=' + nbd_sock
 wait_limit = 3.0
 wait_step = 0.2
-- 
2.39.3




[PATCH] iotests/277: Use iotests.sock_dir for socket creation

2024-01-24 Thread Andrey Drobyshev
If socket path is too long (longer than 108 bytes), socket can't be
opened.  This might lead to failure when test dir path is long enough.
Make sure socket is created in iotests.sock_dir to avoid such a case.

This commit basically aligns iotests/277 with the rest of iotests.

Signed-off-by: Andrey Drobyshev 
---
 tests/qemu-iotests/277 | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/277 b/tests/qemu-iotests/277
index 24833e7eb6..4224202ac2 100755
--- a/tests/qemu-iotests/277
+++ b/tests/qemu-iotests/277
@@ -27,7 +27,8 @@ from iotests import file_path, log
 iotests.script_initialize()
 
 
-nbd_sock, conf_file = file_path('nbd-sock', 'nbd-fault-injector.conf')
+conf_file = file_path('nbd-fault-injector.conf')
+nbd_sock = file_path('nbd-sock', base_dir=iotests.sock_dir)
 
 
 def make_conf_file(event):
-- 
2.39.3




Re: [PATCH] kvm: emit GUEST_PANICKED event in case of abnormal KVM exit

2024-01-11 Thread Andrey Drobyshev
On 11/1/23 18:13, Denis V. Lunev wrote:
> On 11/1/23 16:23, Andrey Drobyshev wrote:
>> Currently we emit GUEST_PANICKED event in case kvm_vcpu_ioctl() returns
>> KVM_EXIT_SYSTEM_EVENT with the event type KVM_SYSTEM_EVENT_CRASH.  Let's
>> extend this scenario and emit GUEST_PANICKED in case of an abnormal KVM
>> exit.  That's a natural thing to do since in this case guest is no
>> longer operational anyway.
>>
>> Signed-off-by: Andrey Drobyshev 
>> Acked-by: Denis V. Lunev 
>> ---
>>   accel/kvm/kvm-all.c | 19 +++
>>   1 file changed, 15 insertions(+), 4 deletions(-)
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index e39a810a4e..d74b3f0b0e 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -2816,6 +2816,14 @@ static void kvm_eat_signals(CPUState *cpu)
>>   } while (sigismember(, SIG_IPI));
>>   }
>>   +static void kvm_emit_guest_crash(CPUState *cpu)
>> +{
>> +    kvm_cpu_synchronize_state(cpu);
>> +    qemu_mutex_lock_iothread();
>> +    qemu_system_guest_panicked(cpu_get_crash_info(cpu));
>> +    qemu_mutex_unlock_iothread();
>> +}
>> +
>>   int kvm_cpu_exec(CPUState *cpu)
>>   {
>>   struct kvm_run *run = cpu->kvm_run;
>> @@ -2969,21 +2977,24 @@ int kvm_cpu_exec(CPUState *cpu)
>>   ret = EXCP_INTERRUPT;
>>   break;
>>   case KVM_SYSTEM_EVENT_CRASH:
>> -    kvm_cpu_synchronize_state(cpu);
>> -    qemu_mutex_lock_iothread();
>> -    qemu_system_guest_panicked(cpu_get_crash_info(cpu));
>> -    qemu_mutex_unlock_iothread();
>> +    kvm_emit_guest_crash(cpu);
>>   ret = 0;
>>   break;
>>   default:
>>   DPRINTF("kvm_arch_handle_exit\n");
>>   ret = kvm_arch_handle_exit(cpu, run);
>> +    if (ret < 0) {
>> +    kvm_emit_guest_crash(cpu);
>> +    }
>>   break;
>>   }
>>   break;
>>   default:
>>   DPRINTF("kvm_arch_handle_exit\n");
>>   ret = kvm_arch_handle_exit(cpu, run);
>> +    if (ret < 0) {
>> +    kvm_emit_guest_crash(cpu);
>> +    }
>>   break;
>>   }
>>   } while (ret == 0);
> This allows to gracefully handle this problem in production
> and reset the guest using on_crash action in libvirt.

Ping




Re: [PATCH] block: allocate aligned write buffer for 'truncate -m full'

2024-01-11 Thread Andrey Drobyshev
On 12/11/23 13:27, Denis V. Lunev wrote:
> On 12/11/23 11:55, Andrey Drobyshev wrote:
>> In case we're truncating an image opened with O_DIRECT, we might get
>> -EINVAL on write with unaligned buffer.  In particular, when running
>> iotests/298 with '-nocache' we get:
>>
>> qemu-io: Failed to resize underlying file: Could not write zeros for
>> preallocation: Invalid argument
>>
>> Let's just allocate the buffer using qemu_blockalign0() instead.
>>
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>   block/file-posix.c | 4 ++--
>>   1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/file-posix.c b/block/file-posix.c
>> index b862406c71..cee8de510b 100644
>> --- a/block/file-posix.c
>> +++ b/block/file-posix.c
>> @@ -2354,7 +2354,7 @@ static int handle_aiocb_truncate(void *opaque)
>>   goto out;
>>   }
>>   -    buf = g_malloc0(65536);
>> +    buf = qemu_blockalign0(aiocb->bs, 65536);
>>     seek_result = lseek(fd, current_length, SEEK_SET);
>>   if (seek_result < 0) {
>> @@ -2413,7 +2413,7 @@ out:
>>   }
>>   }
>>   -    g_free(buf);
>> +    qemu_vfree(buf);
>>   return result;
>>   }
>>   
> Reviewed-by: Denis V. Lunev 

Ping



Re: [PATCH] iotests: don't run tests requiring cached writes in '-nocache' mode

2024-01-11 Thread Andrey Drobyshev
On 12/11/23 15:32, Andrey Drobyshev wrote:
> There're tests whose logic implies running without O_DIRECT set,
> otherwise they fail when running iotests in '-nocache' mode.  For these
> tests let's add _require_no_o_direct() helper which can be put in the
> preabmle and which makes sure '-nocache' isn't set.  Use it to skip
> running the following tests:
> 
>   * 271: creates files with unaligned sizes, thus producing multiple
> errors like:
> 
> qemu-io: can't open device /path/to/t.qcow2.raw: Cannot get 'write'
> permission without 'resize': Image size is not a multiple of request alignment
> 
>   * 308, file-io-error: use fuse exports.  Though fuse does have
> 'direct-io' mode (see https://docs.kernel.org/filesystems/fuse-io.html)
> we aren't using it yet, thus getting errors like:
> 
> qemu-io: can't open device /path/to/t.qcow2.fuse: Could not open
> '/path/to/t.qcow2.fuse': filesystem does not support O_DIRECT
> 
> Signed-off-by: Andrey Drobyshev 
> ---
>  tests/qemu-iotests/271 | 1 +
>  tests/qemu-iotests/308 | 2 ++
>  tests/qemu-iotests/common.rc   | 7 +++
>  tests/qemu-iotests/tests/file-io-error | 1 +
>  4 files changed, 11 insertions(+)
> 
> diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271
> index 59a6fafa2f..1424b6954d 100755
> --- a/tests/qemu-iotests/271
> +++ b/tests/qemu-iotests/271
> @@ -44,6 +44,7 @@ _supported_fmt qcow2
>  _supported_proto file nfs
>  _supported_os Linux
>  _unsupported_imgopts extended_l2 compat=0.10 cluster_size data_file 
> refcount_bits=1[^0-9]
> +_require_no_o_direct
>  
>  l2_offset=$((0x4))
>  
> diff --git a/tests/qemu-iotests/308 b/tests/qemu-iotests/308
> index de12b2b1b9..535455e5b1 100755
> --- a/tests/qemu-iotests/308
> +++ b/tests/qemu-iotests/308
> @@ -52,6 +52,8 @@ _unsupported_fmt vpc
>  _supported_proto file # We create the FUSE export manually
>  _supported_os Linux # We need /dev/urandom
>  
> +_require_no_o_direct
> +
>  # $1: Export ID
>  # $2: Options (beyond the node-name and ID)
>  # $3: Expected return value (defaults to 'return')
> diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
> index 95c12577dd..f61eae73b4 100644
> --- a/tests/qemu-iotests/common.rc
> +++ b/tests/qemu-iotests/common.rc
> @@ -857,6 +857,13 @@ _check_o_direct()
>  [[ "$out" != *"O_DIRECT"* ]]
>  }
>  
> +_require_no_o_direct()
> +{
> +if [ $CACHEMODE == "none" ] || [ $CACHEMODE == "directsync" ]; then
> +_notrun "not suitable for cache mode: $CACHEMODE (implies O_DIRECT)"
> +fi
> +}
> +
>  _require_o_direct()
>  {
>  if ! _check_o_direct; then
> diff --git a/tests/qemu-iotests/tests/file-io-error 
> b/tests/qemu-iotests/tests/file-io-error
> index 88ee5f670c..2b8dc7f009 100755
> --- a/tests/qemu-iotests/tests/file-io-error
> +++ b/tests/qemu-iotests/tests/file-io-error
> @@ -40,6 +40,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
>  # Format-agnostic (we do not use any), but we do test the file protocol
>  _supported_proto file
>  _require_drivers blkdebug null-co
> +_require_no_o_direct
>  
>  if [ "$IMGOPTSSYNTAX" = "true" ]; then
>  # We need `$QEMU_IO -f file` to work; IMGOPTSSYNTAX uses --image-opts,

Ping



[PATCH] iotests: don't run tests requiring cached writes in '-nocache' mode

2023-12-11 Thread Andrey Drobyshev
There're tests whose logic implies running without O_DIRECT set,
otherwise they fail when running iotests in '-nocache' mode.  For these
tests let's add _require_no_o_direct() helper which can be put in the
preabmle and which makes sure '-nocache' isn't set.  Use it to skip
running the following tests:

  * 271: creates files with unaligned sizes, thus producing multiple
errors like:

qemu-io: can't open device /path/to/t.qcow2.raw: Cannot get 'write'
permission without 'resize': Image size is not a multiple of request alignment

  * 308, file-io-error: use fuse exports.  Though fuse does have
'direct-io' mode (see https://docs.kernel.org/filesystems/fuse-io.html)
we aren't using it yet, thus getting errors like:

qemu-io: can't open device /path/to/t.qcow2.fuse: Could not open
'/path/to/t.qcow2.fuse': filesystem does not support O_DIRECT

Signed-off-by: Andrey Drobyshev 
---
 tests/qemu-iotests/271 | 1 +
 tests/qemu-iotests/308 | 2 ++
 tests/qemu-iotests/common.rc   | 7 +++
 tests/qemu-iotests/tests/file-io-error | 1 +
 4 files changed, 11 insertions(+)

diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271
index 59a6fafa2f..1424b6954d 100755
--- a/tests/qemu-iotests/271
+++ b/tests/qemu-iotests/271
@@ -44,6 +44,7 @@ _supported_fmt qcow2
 _supported_proto file nfs
 _supported_os Linux
 _unsupported_imgopts extended_l2 compat=0.10 cluster_size data_file 
refcount_bits=1[^0-9]
+_require_no_o_direct
 
 l2_offset=$((0x4))
 
diff --git a/tests/qemu-iotests/308 b/tests/qemu-iotests/308
index de12b2b1b9..535455e5b1 100755
--- a/tests/qemu-iotests/308
+++ b/tests/qemu-iotests/308
@@ -52,6 +52,8 @@ _unsupported_fmt vpc
 _supported_proto file # We create the FUSE export manually
 _supported_os Linux # We need /dev/urandom
 
+_require_no_o_direct
+
 # $1: Export ID
 # $2: Options (beyond the node-name and ID)
 # $3: Expected return value (defaults to 'return')
diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 95c12577dd..f61eae73b4 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -857,6 +857,13 @@ _check_o_direct()
 [[ "$out" != *"O_DIRECT"* ]]
 }
 
+_require_no_o_direct()
+{
+if [ $CACHEMODE == "none" ] || [ $CACHEMODE == "directsync" ]; then
+_notrun "not suitable for cache mode: $CACHEMODE (implies O_DIRECT)"
+fi
+}
+
 _require_o_direct()
 {
 if ! _check_o_direct; then
diff --git a/tests/qemu-iotests/tests/file-io-error 
b/tests/qemu-iotests/tests/file-io-error
index 88ee5f670c..2b8dc7f009 100755
--- a/tests/qemu-iotests/tests/file-io-error
+++ b/tests/qemu-iotests/tests/file-io-error
@@ -40,6 +40,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 # Format-agnostic (we do not use any), but we do test the file protocol
 _supported_proto file
 _require_drivers blkdebug null-co
+_require_no_o_direct
 
 if [ "$IMGOPTSSYNTAX" = "true" ]; then
 # We need `$QEMU_IO -f file` to work; IMGOPTSSYNTAX uses --image-opts,
-- 
2.39.3




[PATCH] block: allocate aligned write buffer for 'truncate -m full'

2023-12-11 Thread Andrey Drobyshev
In case we're truncating an image opened with O_DIRECT, we might get
-EINVAL on write with unaligned buffer.  In particular, when running
iotests/298 with '-nocache' we get:

qemu-io: Failed to resize underlying file: Could not write zeros for
preallocation: Invalid argument

Let's just allocate the buffer using qemu_blockalign0() instead.

Signed-off-by: Andrey Drobyshev 
---
 block/file-posix.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index b862406c71..cee8de510b 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2354,7 +2354,7 @@ static int handle_aiocb_truncate(void *opaque)
 goto out;
 }
 
-buf = g_malloc0(65536);
+buf = qemu_blockalign0(aiocb->bs, 65536);
 
 seek_result = lseek(fd, current_length, SEEK_SET);
 if (seek_result < 0) {
@@ -2413,7 +2413,7 @@ out:
 }
 }
 
-g_free(buf);
+qemu_vfree(buf);
 return result;
 }
 
-- 
2.39.3




Re: [PATCH] Revert "test/qga: use G_TEST_DIR to locate os-release test file"

2023-12-04 Thread Andrey Drobyshev
On 12/4/23 19:09, Marc-André Lureau wrote:
> Hi
> 
> On Mon, Dec 4, 2023 at 9:01 PM Andrey Drobyshev
>  wrote:
>>
>> On 12/4/23 18:51, Marc-André Lureau wrote:
>>> Hi
>>>
>>> On Mon, Dec 4, 2023 at 8:33 PM Andrey Drobyshev
>>>  wrote:
>>>>
>>>> Since the commit a85d09269b QGA_OS_RELEASE variable points to the path
>>>> relative to the build dir.  Then on qemu-ga startup this path can't be
>>>> found as qemu-ga cwd is somewhere else, which leads to the test failure:
>>>>
>>>>   # ./tests/unit/test-qga -p /qga/guest-get-osinfo
>>>>   # random seed: R02S3a90c22d77ff1070fbd844f4959cf4a4
>>>>   # Start of qga tests
>>>>   **
>>>>   ERROR:../tests/unit/test-qga.c:906:test_qga_guest_get_osinfo: 'str' 
>>>> should not be NULL
>>>>   Bail out! ERROR:../tests/unit/test-qga.c:906:test_qga_guest_get_osinfo: 
>>>> 'str' should not be NULL
>>>>
>>>> Let's obtain the absolute path again.
>>>
>>> Can you detail how the build and the test is done?
>>>
>>
>> Simple as:
>>
>>> ./configure --cc=gcc --target-list=x86_64-softmmu --enable-guest-agent && 
>>> make -j16
>>> cd build; tests/unit/test-qga -p /qga/guest-get-osinfo
>>
>>
>>> If I recall correctly, this change was done in order to move qga to a
>>> subproject(), but isn't strictly required at this point. Although I
>>> believe it is more correct to lookup test data relative to
>>> G_TEST_DIST.
>>>
>>
>> Then we'd have to change cwd of qemu-ga at startup to ensure relative
>> paths work.  Right now (with the initial change) it appears broken.
> 
> By reverting the patch, it is _still_ broken if you run the test
> manually from a different directory (say from tests/unit for example)
>
> With G_TEST_DIST, and proper testing environment, it works from any directory.
> 

No, it seems to be failing as well, only earlier.  Before the revert:
> cd build/tests/unit; ./test-qga 
> # random seed: R02S450ef942c699b5af6dff48f9c5b73b33
> **
> ERROR:../tests/unit/test-qga.c:79:fixture_setup: assertion failed (error == 
> NULL): Failed to execute child process “$SRC/build/tests/unit/qga/qemu-ga” 
> (No such file or directory) (g-exec-error-quark, 8)
> Bail out! ERROR:../tests/unit/test-qga.c:79:fixture_setup: assertion failed 
> (error == NULL): Failed to execute child process 
> “$SRC/build/tests/unit/qga/qemu-ga” (No such file or directory) 
> (g-exec-error-quark, 8)

But maybe my testing environment isn't proper?

> Tests are not meant to be run manually, you should run them through
> the test runner: meson test -v test-qga
> 

That's a good point, but I just found it suspicious that this is
literally the *only* case of the *only* unit test which fails (when run
directly from ./build).  Could we fix the direct execution as well then?

Btw test runner also cannot be run from just any directory, otherwise it
complains:
> meson test -v test-qga 
> 
> ERROR: No such build data file as 
> '$SRC/build/tests/unit/meson-private/build.dat'.


>>
>>>>
>>>> This reverts commit a85d09269bb1a7071d3ce0f2957e3ca9dba7c047.
>>>>
>>>> Signed-off-by: Andrey Drobyshev 
>>>> ---
>>>>  tests/unit/test-qga.c | 6 --
>>>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/tests/unit/test-qga.c b/tests/unit/test-qga.c
>>>> index 671e83cb86..47cf5e30ec 100644
>>>> --- a/tests/unit/test-qga.c
>>>> +++ b/tests/unit/test-qga.c
>>>> @@ -1034,10 +1034,12 @@ static void 
>>>> test_qga_guest_get_osinfo(gconstpointer data)
>>>>  g_autoptr(QDict) ret = NULL;
>>>>  char *env[2];
>>>>  QDict *val;
>>>> +g_autofree gchar *cwd = NULL;
>>>>
>>>> +cwd = g_get_current_dir();
>>>>  env[0] = g_strdup_printf(
>>>> -"QGA_OS_RELEASE=%s%c..%cdata%ctest-qga-os-release",
>>>> -g_test_get_dir(G_TEST_DIST), G_DIR_SEPARATOR, G_DIR_SEPARATOR, 
>>>> G_DIR_SEPARATOR);
>>>> +"QGA_OS_RELEASE=%s%ctests%cdata%ctest-qga-os-release",
>>>> +cwd, G_DIR_SEPARATOR, G_DIR_SEPARATOR, G_DIR_SEPARATOR);
>>>>  env[1] = NULL;
>>>>  fixture_setup(, NULL, env);
>>>>
>>>> --
>>>> 2.39.3
>>>>
>>>>
>>>
>>>
>>
> 
> 




Re: [PATCH] Revert "test/qga: use G_TEST_DIR to locate os-release test file"

2023-12-04 Thread Andrey Drobyshev
On 12/4/23 18:51, Marc-André Lureau wrote:
> Hi
> 
> On Mon, Dec 4, 2023 at 8:33 PM Andrey Drobyshev
>  wrote:
>>
>> Since the commit a85d09269b QGA_OS_RELEASE variable points to the path
>> relative to the build dir.  Then on qemu-ga startup this path can't be
>> found as qemu-ga cwd is somewhere else, which leads to the test failure:
>>
>>   # ./tests/unit/test-qga -p /qga/guest-get-osinfo
>>   # random seed: R02S3a90c22d77ff1070fbd844f4959cf4a4
>>   # Start of qga tests
>>   **
>>   ERROR:../tests/unit/test-qga.c:906:test_qga_guest_get_osinfo: 'str' should 
>> not be NULL
>>   Bail out! ERROR:../tests/unit/test-qga.c:906:test_qga_guest_get_osinfo: 
>> 'str' should not be NULL
>>
>> Let's obtain the absolute path again.
> 
> Can you detail how the build and the test is done?
> 

Simple as:

> ./configure --cc=gcc --target-list=x86_64-softmmu --enable-guest-agent && 
> make -j16
> cd build; tests/unit/test-qga -p /qga/guest-get-osinfo


> If I recall correctly, this change was done in order to move qga to a
> subproject(), but isn't strictly required at this point. Although I
> believe it is more correct to lookup test data relative to
> G_TEST_DIST.
> 

Then we'd have to change cwd of qemu-ga at startup to ensure relative
paths work.  Right now (with the initial change) it appears broken.

>>
>> This reverts commit a85d09269bb1a7071d3ce0f2957e3ca9dba7c047.
>>
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>  tests/unit/test-qga.c | 6 --
>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/tests/unit/test-qga.c b/tests/unit/test-qga.c
>> index 671e83cb86..47cf5e30ec 100644
>> --- a/tests/unit/test-qga.c
>> +++ b/tests/unit/test-qga.c
>> @@ -1034,10 +1034,12 @@ static void test_qga_guest_get_osinfo(gconstpointer 
>> data)
>>  g_autoptr(QDict) ret = NULL;
>>  char *env[2];
>>  QDict *val;
>> +g_autofree gchar *cwd = NULL;
>>
>> +cwd = g_get_current_dir();
>>  env[0] = g_strdup_printf(
>> -"QGA_OS_RELEASE=%s%c..%cdata%ctest-qga-os-release",
>> -g_test_get_dir(G_TEST_DIST), G_DIR_SEPARATOR, G_DIR_SEPARATOR, 
>> G_DIR_SEPARATOR);
>> +"QGA_OS_RELEASE=%s%ctests%cdata%ctest-qga-os-release",
>> +cwd, G_DIR_SEPARATOR, G_DIR_SEPARATOR, G_DIR_SEPARATOR);
>>  env[1] = NULL;
>>  fixture_setup(, NULL, env);
>>
>> --
>> 2.39.3
>>
>>
> 
> 




[PATCH] Revert "test/qga: use G_TEST_DIR to locate os-release test file"

2023-12-04 Thread Andrey Drobyshev
Since the commit a85d09269b QGA_OS_RELEASE variable points to the path
relative to the build dir.  Then on qemu-ga startup this path can't be
found as qemu-ga cwd is somewhere else, which leads to the test failure:

  # ./tests/unit/test-qga -p /qga/guest-get-osinfo
  # random seed: R02S3a90c22d77ff1070fbd844f4959cf4a4
  # Start of qga tests
  **
  ERROR:../tests/unit/test-qga.c:906:test_qga_guest_get_osinfo: 'str' should 
not be NULL
  Bail out! ERROR:../tests/unit/test-qga.c:906:test_qga_guest_get_osinfo: 'str' 
should not be NULL

Let's obtain the absolute path again.

This reverts commit a85d09269bb1a7071d3ce0f2957e3ca9dba7c047.

Signed-off-by: Andrey Drobyshev 
---
 tests/unit/test-qga.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/unit/test-qga.c b/tests/unit/test-qga.c
index 671e83cb86..47cf5e30ec 100644
--- a/tests/unit/test-qga.c
+++ b/tests/unit/test-qga.c
@@ -1034,10 +1034,12 @@ static void test_qga_guest_get_osinfo(gconstpointer 
data)
 g_autoptr(QDict) ret = NULL;
 char *env[2];
 QDict *val;
+g_autofree gchar *cwd = NULL;
 
+cwd = g_get_current_dir();
 env[0] = g_strdup_printf(
-"QGA_OS_RELEASE=%s%c..%cdata%ctest-qga-os-release",
-g_test_get_dir(G_TEST_DIST), G_DIR_SEPARATOR, G_DIR_SEPARATOR, 
G_DIR_SEPARATOR);
+"QGA_OS_RELEASE=%s%ctests%cdata%ctest-qga-os-release",
+cwd, G_DIR_SEPARATOR, G_DIR_SEPARATOR, G_DIR_SEPARATOR);
 env[1] = NULL;
 fixture_setup(, NULL, env);
 
-- 
2.39.3




Re: [PATCH] kvm: emit GUEST_PANICKED event in case of abnormal KVM exit

2023-11-22 Thread Andrey Drobyshev
On 11/1/23 18:13, Denis V. Lunev wrote:
> On 11/1/23 16:23, Andrey Drobyshev wrote:
>> Currently we emit GUEST_PANICKED event in case kvm_vcpu_ioctl() returns
>> KVM_EXIT_SYSTEM_EVENT with the event type KVM_SYSTEM_EVENT_CRASH.  Let's
>> extend this scenario and emit GUEST_PANICKED in case of an abnormal KVM
>> exit.  That's a natural thing to do since in this case guest is no
>> longer operational anyway.
>>
>> Signed-off-by: Andrey Drobyshev 
>> Acked-by: Denis V. Lunev 
>> ---
>>   accel/kvm/kvm-all.c | 19 +++
>>   1 file changed, 15 insertions(+), 4 deletions(-)
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index e39a810a4e..d74b3f0b0e 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -2816,6 +2816,14 @@ static void kvm_eat_signals(CPUState *cpu)
>>   } while (sigismember(, SIG_IPI));
>>   }
>>   +static void kvm_emit_guest_crash(CPUState *cpu)
>> +{
>> +    kvm_cpu_synchronize_state(cpu);
>> +    qemu_mutex_lock_iothread();
>> +    qemu_system_guest_panicked(cpu_get_crash_info(cpu));
>> +    qemu_mutex_unlock_iothread();
>> +}
>> +
>>   int kvm_cpu_exec(CPUState *cpu)
>>   {
>>   struct kvm_run *run = cpu->kvm_run;
>> @@ -2969,21 +2977,24 @@ int kvm_cpu_exec(CPUState *cpu)
>>   ret = EXCP_INTERRUPT;
>>   break;
>>   case KVM_SYSTEM_EVENT_CRASH:
>> -    kvm_cpu_synchronize_state(cpu);
>> -    qemu_mutex_lock_iothread();
>> -    qemu_system_guest_panicked(cpu_get_crash_info(cpu));
>> -    qemu_mutex_unlock_iothread();
>> +    kvm_emit_guest_crash(cpu);
>>   ret = 0;
>>   break;
>>   default:
>>   DPRINTF("kvm_arch_handle_exit\n");
>>   ret = kvm_arch_handle_exit(cpu, run);
>> +    if (ret < 0) {
>> +    kvm_emit_guest_crash(cpu);
>> +    }
>>   break;
>>   }
>>   break;
>>   default:
>>   DPRINTF("kvm_arch_handle_exit\n");
>>   ret = kvm_arch_handle_exit(cpu, run);
>> +    if (ret < 0) {
>> +    kvm_emit_guest_crash(cpu);
>> +    }
>>   break;
>>   }
>>   } while (ret == 0);
> This allows to gracefully handle this problem in production
> and reset the guest using on_crash action in libvirt.

Ping



[PATCH] iotests: fix default MT detection

2023-11-22 Thread Andrey Drobyshev
MT is being detected based on "-M help" output, and we're searching for
the line ending with " (default)".  However, in downstream one of the
MTs marked as deprecated might become the default, in which case this
logic breaks as the line would now end with " (default) (deprecated)".
To fix potential issues here, let's relax that requirement and detect
the mere presence of " (default)" line instead.

Signed-off-by: Andrey Drobyshev 
---
 tests/qemu-iotests/testenv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/testenv.py b/tests/qemu-iotests/testenv.py
index e67ebd254b..3ff38f2661 100644
--- a/tests/qemu-iotests/testenv.py
+++ b/tests/qemu-iotests/testenv.py
@@ -40,7 +40,7 @@ def get_default_machine(qemu_prog: str) -> str:
 
 machines = outp.split('\n')
 try:
-default_machine = next(m for m in machines if m.endswith(' (default)'))
+default_machine = next(m for m in machines if ' (default)' in m)
 except StopIteration:
 return ''
 default_machine = default_machine.split(' ', 1)[0]
-- 
2.39.3




Re: [PATCH 4/7] qcow2: make subclusters discardable

2023-11-10 Thread Andrey Drobyshev
On 10/31/23 18:33, Hanna Czenczek wrote:
> (Sorry, opened another reply window, forgot I already had one open...)
> 
> On 20.10.23 23:56, Andrey Drobyshev wrote:
>> This commit makes the discard operation work on the subcluster level
>> rather than cluster level.  It introduces discard_l2_subclusters()
>> function and makes use of it in qcow2 discard implementation, much like
>> it's done with zero_in_l2_slice() / zero_l2_subclusters().  It also
>> changes the qcow2 driver pdiscard_alignment to subcluster_size.  That
>> way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE)
>> operation and free host disk space.
>>
>> This feature will let us gain additional disk space on guest
>> TRIM/discard requests, especially when using large enough clusters
>> (1M, 2M) with subclusters enabled.
>>
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>   block/qcow2-cluster.c | 100 --
>>   block/qcow2.c |   8 ++--
>>   2 files changed, 101 insertions(+), 7 deletions(-)
>>
>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>> index 7c6fa5524c..cf40f2dc12 100644
>> --- a/block/qcow2-cluster.c
>> +++ b/block/qcow2-cluster.c
>> @@ -2042,6 +2042,74 @@ discard_in_l2_slice(BlockDriverState *bs,
>> uint64_t offset, uint64_t nb_clusters,
>>   return nb_clusters;
>>   }
>>   +static int coroutine_fn GRAPH_RDLOCK
>> +discard_l2_subclusters(BlockDriverState *bs, uint64_t offset,
>> +   uint64_t nb_subclusters,
>> +   enum qcow2_discard_type type,
>> +   bool full_discard,
>> +   SubClusterRangeInfo *pscri)
>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +    uint64_t new_l2_bitmap, l2_bitmap_mask;
>> +    int ret, sc = offset_to_sc_index(s, offset);
>> +    SubClusterRangeInfo scri = { 0 };
>> +
>> +    if (!pscri) {
>> +    ret = get_sc_range_info(bs, offset, nb_subclusters, );
>> +    if (ret < 0) {
>> +    goto out;
>> +    }
>> +    } else {
>> +    scri = *pscri;
> 
> Allowing to takes this from the caller sounds dangerous, considering we
> need to track who takes care of freeing scri.l2_slice.
> 
>> +    }
>> +
>> +    l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc +
>> nb_subclusters);
>> +    new_l2_bitmap = scri.l2_bitmap;
>> +    new_l2_bitmap &= ~l2_bitmap_mask;
>> +
>> +    /*
>> + * If there're no allocated subclusters left, we might as well
>> discard
>> + * the entire cluster.  That way we'd also update the refcount
>> table.
>> + */
>> +    if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) {
> 
> What if there are subclusters in the cluster that are marked as zero,
> outside of the discarded range?  It sounds wrong to apply a discard with
> either full_discard set or cleared to them.
> 

Hmmm, then I think before falling to this path we should either:

1. Make sure full_discard=false.  That is directly implied from what you
said in your other mail: discarding with full_discard=false is
equivalent to zeroizing;
2. Check that no subclusters within this cluster are explicitly marked
as zeroes.
3. Check that either 1) or 2) is true (if ((1) || (2))).

For now I like option 3) better.

>> +    return discard_in_l2_slice(bs,
>> +   QEMU_ALIGN_DOWN(offset,
>> s->cluster_size),
>> +   1, type, full_discard);
>> +    }
>> +
>> +    /*
>> + * Full discard means we fall through to the backing file, thus
>> we only
>> + * need to mark the subclusters as deallocated.
> 
> I think it also means we need to clear the zero bits.
>

Yes, that seems right.

> Hanna
> 
> [...]



Re: [PATCH 5/7] qcow2: zero_l2_subclusters: fall through to discard operation when requested

2023-11-10 Thread Andrey Drobyshev
On 11/3/23 17:19, Hanna Czenczek wrote:
> On 20.10.23 23:56, Andrey Drobyshev wrote:
>> When zeroizing subclusters within single cluster, detect usage of the
>> BDRV_REQ_MAY_UNMAP flag and fall through to the subcluster-based discard
>> operation, much like it's done with the cluster-based discards.  That
>> way subcluster-aligned operations "qemu-io -c 'write -z -u ...'" will
>> lead to actual unmap.
> 
> Ever since the introduction of discard-no-unref, I wonder whether that
> is actually an advantage or not.  I can’t think of a reason why it’d be
> advantageous to drop the allocation.

You mean omitting the UNallocation on the discard path?

> 
> On the other hand, zero_in_l2_slice() does it indeed, so consistency is
> a reasonable argument.
> 
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>   block/qcow2-cluster.c | 17 ++---
>>   1 file changed, 14 insertions(+), 3 deletions(-)
>>
>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>> index cf40f2dc12..040251f2c3 100644
>> --- a/block/qcow2-cluster.c
>> +++ b/block/qcow2-cluster.c
>> @@ -2242,7 +2242,7 @@ zero_l2_subclusters(BlockDriverState *bs,
>> uint64_t offset,
>>   unsigned nb_subclusters, int flags)
>>   {
>>   BDRVQcow2State *s = bs->opaque;
>> -    uint64_t new_l2_bitmap;
>> +    uint64_t new_l2_bitmap, l2_bitmap_mask;
>>   int ret, sc = offset_to_sc_index(s, offset);
>>   SubClusterRangeInfo scri = { 0 };
>>   @@ -2251,9 +2251,10 @@ zero_l2_subclusters(BlockDriverState *bs,
>> uint64_t offset,
>>   goto out;
>>   }
>>   +    l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc +
>> nb_subclusters);
> 
> “l2_bitmap_mask” already wasn’t a great name in patch 4 because it
> doesn’t say what mask it is, but in patch 4, there was at least only one
> relevant mask.  Here, we have two (ALLOC_RANGE and ZERO_RANGE), so the
> name should reflect what kind of mask it is.
> 

Noted.

>>   new_l2_bitmap = scri.l2_bitmap;
>> -    new_l2_bitmap |=  QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc +
>> nb_subclusters);
>> -    new_l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc +
>> nb_subclusters);
>> +    new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
>> +    new_l2_bitmap &= ~l2_bitmap_mask;
>>     /*
>>    * If there're no non-zero subclusters left, we might as well
>> zeroize
>> @@ -2266,6 +2267,16 @@ zero_l2_subclusters(BlockDriverState *bs,
>> uint64_t offset,
>>   1, flags);
>>   }
>>   +    /*
>> + * If the request allows discarding subclusters and they're actually
>> + * allocated, we go down the discard path since after the discard
>> + * operation the subclusters are going to be read as zeroes anyway.
>> + */
>> +    if ((flags & BDRV_REQ_MAY_UNMAP) && (scri.l2_bitmap &
>> l2_bitmap_mask)) {
>> +    return discard_l2_subclusters(bs, offset, nb_subclusters,
>> +  QCOW2_DISCARD_REQUEST, false,
>> );
>> +    }
> 
> I wonder why it matters whether any subclusters are allocated, i.e. why
> can’t we just immediately use discard_l2_subclusters() whenever
> BDRV_REQ_MAY_UNMAP is set, without even fetching SCRI (that would also
> save us passing SCRI here, which I’ve already said on patch 4, I don’t
> find great).
> 

Yes, this way we'd indeed be able to get rid of passing an additional
argument.

> Doesn’t discard_l2_subclusters() guarantee the clusters read as zero
> when full_discard=false?  There is this case where it won’t mark the
> subclusters as zero if there is no backing file, and none of the
> subclusters is allocated.  But still, (A) those subclusters will read as
> zero, and (B) if there is a problem there, why don’t we just always mark
> those subclusters as zero?  This optimization only has effect when the
> guest discards/zeroes subclusters (not whole clusters) that were already
> discarded, so sounds miniscule.
> 
> Hanna
> 

Good question.  I also can't think of any issues when
zeroizing/discarding already unallocated clusters.

Essentially you're saying that zeroizing subclusters is equivalent to
discard_l2_subclusters(full_discard=false).  Then in
discard_l2_subclusters() implementation we may mark the subclusters as
zeroes regardless of their allocation status in case of
full_discard=false.  But when dealing with the whole clusters this logic
isn't quite applicable cause if the l2 entry isn't allocated at all
there's no point marking it as zero.  Correct?

> [...]



Re: [PATCH 7/7] iotests/271: check disk usage on subcluster-based discard/unmap

2023-11-09 Thread Andrey Drobyshev
On 11/3/23 17:51, Hanna Czenczek wrote:
> On 20.10.23 23:56, Andrey Drobyshev wrote:
>> Add _verify_du_delta() checker which is used to check that real disk
>> usage delta meets the expectations.  For now we use it for checking that
>> subcluster-based discard/unmap operations lead to actual disk usage
>> decrease (i.e. PUNCH_HOLE operation is performed).
> 
> I’m not too happy about checking the disk usage because that relies on
> the underlying filesystem actually accepting and executing the unmap. 
> Why is it not enough to check the L2 bitmap?
> 
> …Coming back later (I had to fix the missing `ret = ` I mentioned in
> patch 2, or this test would hang, so I couldn’t run it at first), I note
> that checking the disk usage in fact doesn’t work on tmpfs.  I usually
> run the iotests in tmpfs, so that’s not great.
> 

My original idea was to make sure that the PUNCH_HOLE operation did
indeed take place, i.e. there was an actual discard.  For instance,
currently the discard operation initiated by qemu-io is called with the
QCOW2_DISCARD_REQUEST discard type, but if some other type is passed by
mistake, qcow2_queue_discard() won't be called, and though the
subclusters will be marked unallocated in L2 the data will still be
there.  Not quite what we expect from discard operation.

BTW checking the disk usage on tmpfs works on my machine:

> # cd /tmp; df -Th /tmp
> Filesystem Type   Size  Used Avail Use% Mounted on
> tmpfs  tmpfs   32G  2.5M   32G   1% /tmp
> # BUILD=/root/src/qemu/master/build
> # $BUILD/qemu-img create -f qcow2 -o extended_l2=on img.qcow2 1M
> Formatting 'img.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=on 
> compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16
> # $BUILD/qemu-io -c 'write -q 0 128k' img.qcow2
> # du --block-size=1 img.qcow2
> 397312  img.qcow2
> # $BUILD/qemu-io -f qcow2 -c 'discard -q 0 8k' img.qcow2
> # du --block-size=1 img.qcow2
> 389120  img.qcow2
> # $BUILD/qemu-io -f qcow2 -c 'discard -q 8k 120k' img.qcow2
> # du --block-size=1 img.qcow2
> 266240  img.qcow2

I'm wondering what this might depend on and can't we overcome this?

>> Also add separate test case for discarding particular subcluster within
>> one cluster.
>>
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>   tests/qemu-iotests/271 | 25 -
>>   tests/qemu-iotests/271.out |  2 ++
>>   2 files changed, 26 insertions(+), 1 deletion(-)
>>
>> diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271
>> index c7c2cadda0..5fcb209f5f 100755
>> --- a/tests/qemu-iotests/271
>> +++ b/tests/qemu-iotests/271
>> @@ -81,6 +81,15 @@ _verify_l2_bitmap()
>>   fi
>>   }
>>   +# Check disk usage delta after a discard/unmap operation
>> +# _verify_du_delta $before $after $expected_delta
>> +_verify_du_delta()
>> +{
>> +    if [ $(($1 - $2)) -ne $3 ]; then
>> +    printf "ERROR: unexpected delta: $1 - $2 = $(($1 - $2)) != $3\n"
>> +    fi
>> +}
>> +
>>   # This should be called as _run_test c=XXX sc=XXX off=XXX len=XXX
>> cmd=XXX
>>   # c:   cluster number (0 if unset)
>>   # sc:  subcluster number inside cluster @c (0 if unset)
>> @@ -198,9 +207,12 @@ for use_backing_file in yes no; do
>>   alloc="$(seq 0 31)"; zero=""
>>   _run_test sc=0 len=64k
>>   -    ### Zero and unmap half of cluster #0 (this won't unmap it)
>> +    ### Zero and unmap half of cluster #0 (this will unmap it)
> 
> I think “it” refers to the cluster, and it is not unmapped.  This test
> case does not use a discard, but write -z instead, so it worked before. 
> (The L2 bitmap shown in the output doesn’t change, so functionally, this
> patch series didn’t change this case.)
> 

>From the _run_test() implementation:

> # cmd: the command to pass to qemu-io, must be one of 
>   
> #  write-> write  
>   
> #  zero -> write -z   
>   
> #  unmap-> write -z -u   <-   
> 
> #  compress -> write -c   
>   
> #  discard  -> discard
>   
> _run_test()

So it actually uses 'write -z -u', and we end up with an actual unmap.
I agree that the l2 bitmap doesn't change, that's why I specifically
added disk usage check to catch the changed functionality.

>>   alloc="$(seq 16 31)"; zero="$(seq 0 15)"
>> +    before=$(disk_usage "$TEST_I

Re: [PATCH 4/7] qcow2: make subclusters discardable

2023-11-09 Thread Andrey Drobyshev
On 10/31/23 18:32, Hanna Czenczek wrote:
> On 20.10.23 23:56, Andrey Drobyshev wrote:
>> This commit makes the discard operation work on the subcluster level
>> rather than cluster level.  It introduces discard_l2_subclusters()
>> function and makes use of it in qcow2 discard implementation, much like
>> it's done with zero_in_l2_slice() / zero_l2_subclusters().  It also
>> changes the qcow2 driver pdiscard_alignment to subcluster_size.  That
>> way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE)
>> operation and free host disk space.
>>
>> This feature will let us gain additional disk space on guest
>> TRIM/discard requests, especially when using large enough clusters
>> (1M, 2M) with subclusters enabled.
>>
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>   block/qcow2-cluster.c | 100 --
>>   block/qcow2.c |   8 ++--
>>   2 files changed, 101 insertions(+), 7 deletions(-)
>>
>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>> index 7c6fa5524c..cf40f2dc12 100644
>> --- a/block/qcow2-cluster.c
>> +++ b/block/qcow2-cluster.c
> 
> [...]
> 
>> +    if (scri.l2_bitmap != new_l2_bitmap) {
>> +    set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap);
>> +    qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice);
>> +    }
>> +
>> +    if (s->discard_passthrough[type]) {
>> +    qcow2_queue_discard(bs, (scri.l2_entry & L2E_OFFSET_MASK) +
>> +    offset_into_cluster(s, offset),
>> +    nb_subclusters * s->subcluster_size);
> 
> Are we sure that the cluster is allocated, i.e. that scri.l2_entry &
> L2E_OFFSET_MASK != 0?
> 

I think it must be illegal to mark the entire cluster as unallocated and
yet mark some of the subclusters as allocated.  So in the case you
describe we should detect it earlier in the '!(new_l2_bitmap &
QCOW_L2_BITMAP_ALL_ALLOC)' condition and fall back to discard_in_l2_slice().

> As a side note, I guess discard_in_l2_slice() should also use
> qcow2_queue_discard() instead of bdrv_pdiscard() then.
> 

Yes, looks like it.  I'll  make it a separate patch then.

>> +    }
>> +
>> +    ret = 0;
>> +out:
>> +    qcow2_cache_put(s->l2_table_cache, (void **) _slice);
>> +
>> +    return ret;
>> +}
>> +
>>   int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
>>     uint64_t bytes, enum qcow2_discard_type type,
>>     bool full_discard)
>> @@ -2049,19 +2117,36 @@ int qcow2_cluster_discard(BlockDriverState
>> *bs, uint64_t offset,
>>   BDRVQcow2State *s = bs->opaque;
>>   uint64_t end_offset = offset + bytes;
>>   uint64_t nb_clusters;
>> +    unsigned head, tail;
>>   int64_t cleared;
>>   int ret;
>>     /* Caller must pass aligned values, except at image end */
>> -    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
>> -    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
>> +    assert(QEMU_IS_ALIGNED(offset, s->subcluster_size));
>> +    assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) ||
>>  end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
>>   -    nb_clusters = size_to_clusters(s, bytes);
>> +    head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset;
>> +    offset += head;
>> +
>> +    tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 :
>> +   end_offset - MAX(offset, start_of_cluster(s, end_offset));
>> +    end_offset -= tail;
>>     s->cache_discards = true;
>>   +    if (head) {
>> +    ret = discard_l2_subclusters(bs, offset - head,
>> + size_to_subclusters(s, head), type,
>> + full_discard, NULL);
>> +    if (ret < 0) {
>> +    goto fail;
>> +    }
>> +    }
>> +
>>   /* Each L2 slice is handled by its own loop iteration */
>> +    nb_clusters = size_to_clusters(s, end_offset - offset);
>> +
>>   while (nb_clusters > 0) {
> 
> I think the comment should stay attached to the `while`.
> 

Agreed.

> Hanna
> 
>>   cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
>>     full_discard);
> 




Re: [PATCH 7/7] iotests/271: check disk usage on subcluster-based discard/unmap

2023-11-09 Thread Andrey Drobyshev
On 11/3/23 17:59, Hanna Czenczek wrote:
> On 03.11.23 16:51, Hanna Czenczek wrote:
>> On 20.10.23 23:56, Andrey Drobyshev wrote: 
> 
> [...]
> 
>>> @@ -528,6 +543,14 @@ for use_backing_file in yes no; do
>>>   else
>>>   _make_test_img -o extended_l2=on 1M
>>>   fi
>>> +    # Write cluster #0 and discard its subclusters #0-#3
>>> +    $QEMU_IO -c 'write -q 0 64k' "$TEST_IMG"
>>> +    before=$(disk_usage "$TEST_IMG")
>>> +    $QEMU_IO -c 'discard -q 0 8k' "$TEST_IMG"
>>> +    after=$(disk_usage "$TEST_IMG")
>>> +    _verify_du_delta $before $after 8192
>>> +    alloc="$(seq 4 31)"; zero="$(seq 0 3)"
>>> +    _verify_l2_bitmap 0
>>>   # Write clusters #0-#2 and then discard them
>>>   $QEMU_IO -c 'write -q 0 128k' "$TEST_IMG"
>>>   $QEMU_IO -c 'discard -q 0 128k' "$TEST_IMG"
>>
>> Similarly to above, I think it would be good if we combined this
>> following case with the one you added, i.e. to write 128k from the
>> beginning, drop the write here, and change the discard to be “discard
>> -q 8k 120k”, i.e. skip the subclusters we have already discarded, to
>> see that this is still combined to discard the whole first cluster.
>>
>> ...Ah, see, and when I try this, the following assertion fails:
>>
>> qemu-io: ../block/qcow2-cache.c:156: qcow2_cache_destroy: Assertion
>> `c->entries[i].ref == 0' failed.
>> ./common.rc: line 220: 128894 Aborted (core dumped) (
>> VALGRIND_QEMU="${VALGRIND_QEMU_IO}" _qemu_proc_exec
>> "${VALGRIND_LOGFILE}" "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@" )
>>
>> Looks like an L2 table is leaked somewhere.  That’s why SCRI should be
>> a g_auto()-able type.
> 
> Forgot to add: This single test case here is the only place where we
> test the added functionality.  I think there should be more cases. It
> doesn’t really make sense now that 271 has so many cases for writing
> zeroes, but so few for discarding, now that discarding works on
> subclusters.  Most of them should at least be considered whether we can
> run them for discard as well.
> 
> I didn’t want to push for such an extensive set of tests, but, well, now
> it turned out I overlooked a bug in patch 4, and only found it because I
> thought “this place might also make a nice test case for this series”.
> 
> Hanna
> 

I agree that more coverage should be added.  Based on the previous
email, I see the following cases:

1. Direct 'discard' on the subclusters range (discard_l2_subclusters()).
2. Direct 'discard' on the subclusters range, complementary to an
unallocated range (i.e. discard_l2_subclusters() -> discard_in_l2_slice()).
3. 'write -u -z' on the subclusters range (zero_l2_subclusters() ->
discard_l2_subclusters()).
4. 'write -u -z' on the subclusters range, complementary to an
unallocated range (zero_l2_subclusters() -> discard_l2_subclusters() ->
discard_in_l2_slice()).

Would also be nice to test the zero_l2_subclusters() ->
zero_in_l2_slice() path, but we'd have to somehow check the refcount
table for that since the L2 bitmap doesn't change.

Please let me know if you can think of anything else.



Re: [PATCH 6/7] iotests/common.rc: add disk_usage function

2023-11-09 Thread Andrey Drobyshev
On 11/3/23 17:20, Hanna Czenczek wrote:
> On 20.10.23 23:56, Andrey Drobyshev wrote:
>> Move the definition from iotests/250 to common.rc.  This is used to
>> detect real disk usage of sparse files.  In particular, we want to use
>> it for checking subclusters-based discards.
>>
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>   tests/qemu-iotests/250   | 5 -
>>   tests/qemu-iotests/common.rc | 6 ++
>>   2 files changed, 6 insertions(+), 5 deletions(-)
>>
>> diff --git a/tests/qemu-iotests/250 b/tests/qemu-iotests/250
>> index af48f83aba..c0a0dbc0ff 100755
>> --- a/tests/qemu-iotests/250
>> +++ b/tests/qemu-iotests/250
>> @@ -52,11 +52,6 @@ _unsupported_imgopts data_file
>>   # bdrv_co_truncate(bs->file) call in qcow2_co_truncate(), which
>> might succeed
>>   # anyway.
>>   -disk_usage()
>> -{
>> -    du --block-size=1 $1 | awk '{print $1}'
>> -}
>> -
>>   size=2100M
>>     _make_test_img -o "cluster_size=1M,preallocation=metadata" $size
>> diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
>> index 95c12577dd..5d2ea26c7f 100644
>> --- a/tests/qemu-iotests/common.rc
>> +++ b/tests/qemu-iotests/common.rc
>> @@ -140,6 +140,12 @@ _optstr_add()
>>   fi
>>   }
>>   +# report real disk usage for sparse files
>> +disk_usage()
>> +{
>> +    du --block-size=1 $1 | awk '{print $1}'
> 
> Pre-existing, but since you’re touching this now: Can you please change
> the $1 to "$1"?
>

Sure, will do in v2.

> Hanna
> 
> [...]



Re: [PATCH 2/7] qcow2: add get_sc_range_info() helper for working with subcluster ranges

2023-11-09 Thread Andrey Drobyshev
Hello Hanna,

Sorry for the delay and thanks for your thorough and detailed review.

On 10/31/23 17:53, Hanna Czenczek wrote:
> On 20.10.23 23:56, Andrey Drobyshev wrote:
>> This helper simply obtains the l2 table parameters of the cluster which
>> contains the given subclusters range.  Right now this info is being
>> obtained and used by zero_l2_subclusters().  As we're about to introduce
>> the subclusters discard operation, this helper would let us avoid code
>> duplication.
>>
>> Also introduce struct SubClusterRangeInfo, which would contain all the
>> needed params.
>>
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>   block/qcow2-cluster.c | 90 +--
>>   1 file changed, 62 insertions(+), 28 deletions(-)
>>
>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>> index 904f00d1b3..8801856b93 100644
>> --- a/block/qcow2-cluster.c
>> +++ b/block/qcow2-cluster.c
>> @@ -32,6 +32,13 @@
>>   #include "qemu/memalign.h"
>>   #include "trace.h"
>>   +typedef struct SubClusterRangeInfo {
>> +    uint64_t *l2_slice;
> 
> We should document that this is a strong reference that must be returned
> via qcow2_cache_put().  Maybe you could also define a clean-up function
> using G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC() that does this, allowing this
> type to be used with g_auto().
> 
>> +    int l2_index;
>> +    uint64_t l2_entry;
>> +    uint64_t l2_bitmap;
>> +} SubClusterRangeInfo;
>> +
>>   int coroutine_fn qcow2_shrink_l1_table(BlockDriverState *bs,
>>  uint64_t exact_size)
>>   {
>> @@ -1892,6 +1899,50 @@ again:
>>   return 0;
>>   }
>>   +static int get_sc_range_info(BlockDriverState *bs, uint64_t offset,
>> + unsigned nb_subclusters,
>> + SubClusterRangeInfo *scri)
> 
> It would be good to have documentation for this function, for example
> that it only works on a single cluster, i.e. that the range denoted by
> @offset and @nb_subclusters must not cross a cluster boundary, and that
> @offset must be aligned to subclusters.
> 

I figured out those restrictions are derived from the number of asserts
in the function body itself, much like it's done in
zero_l2_subclusters() and friends.  But of course I don't mind
documenting this.

> In general, it is unclear to me at this point what this function does.

The sole purpose of this function is to avoid the code duplication,
since both zero_l2_subclusters() (pre-existing) and
discard_l2_subclusters() (newly introduced) need to obtain the same info
about the subclusters range they're working with.
 
> OK, it gets the SCRI for all subclusters in the cluster at @offset (this
> is what its name implies), but then it also has this loop that checks
> whether there are compressed clusters among the @nb_subclusters.

Look at the pre-existing implementation of zero_l2_subclusters(): it
also checks that the subcluster range we're dealing with isn't contained
within a compressed cluster (otherwise there's no point zeroizing
individual subclusters).  So the logic isn't changed here.  The only
reason I decided to loop through the subclusters (instead of calling
qcow2_get_cluster_type() for the whole cluster) is so that I could
detect invalid subclusters and return -EINVAL right away.

> It has a comment about being unable to zero/discard subclusters in compressed
> clusters, but the function name says nothing about this scope of
> zeroing/discarding.
> 

Maybe rename it then to stress out that we're dealing with the regular
subclusters only? get_normal_sc_range_info()?

>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +    int ret, sc_cleared = 0, sc_index = offset_to_sc_index(s, offset);
>> +    QCow2SubclusterType sctype;
>> +
>> +    /* Here we only work with the subclusters within single cluster. */
>> +    assert(nb_subclusters > 0 && nb_subclusters <
>> s->subclusters_per_cluster);
>> +    assert(sc_index + nb_subclusters <= s->subclusters_per_cluster);
>> +    assert(offset_into_subcluster(s, offset) == 0);
>> +
>> +    ret = get_cluster_table(bs, offset, >l2_slice,
>> >l2_index);
>> +    if (ret < 0) {
>> +    return ret;
>> +    }
>> +
>> +    scri->l2_entry = get_l2_entry(s, scri->l2_slice, scri->l2_index);
>> +    scri->l2_bitmap = get_l2_bitmap(s, scri->l2_slice, scri->l2_index);
>> +
>> +    do {
>> +    qcow2_get_subcluster_range_type(bs, scri->l2_entry,
>> scri->l2_bitmap,
>> +

Re: [PATCH v3 1/8] qemu-img: rebase: stop when reaching EOF of old backing file

2023-11-01 Thread Andrey Drobyshev
On 11/1/23 11:50, Michael Tokarev wrote:
> 19.09.2023 19:57, Andrey Drobyshev via wrote:
>> In case when we're rebasing within one backing chain, and when target
>> image
>> is larger than old backing file, bdrv_is_allocated_above() ends up
>> setting
>> *pnum = 0.  As a result, target offset isn't getting incremented, and we
>> get stuck in an infinite for loop.  Let's detect this case and proceed
>> further down the loop body, as the offsets beyond the old backing size
>> need
>> to be explicitly zeroed.
>>
>> Signed-off-by: Andrey Drobyshev 
>> Reviewed-by: Denis V. Lunev 
>> Reviewed-by: Hanna Czenczek 
> 
> Is this change not for -stable anymore?  First version of this patch has
> been
> Cc'd to stable, this one is not.
> 
> Thanks,
> 
> /mjt

Hi Michael,

Since this series is already merged in master, I'm not sure whether it's
necessary to forward this particular patch to qemu-stable, or it should
rather be cherry-picked to -stable by one of the block maintainers.

Kevin, maybe you can clarify?

Andrey



[PATCH] kvm: emit GUEST_PANICKED event in case of abnormal KVM exit

2023-11-01 Thread Andrey Drobyshev
Currently we emit GUEST_PANICKED event in case kvm_vcpu_ioctl() returns
KVM_EXIT_SYSTEM_EVENT with the event type KVM_SYSTEM_EVENT_CRASH.  Let's
extend this scenario and emit GUEST_PANICKED in case of an abnormal KVM
exit.  That's a natural thing to do since in this case guest is no
longer operational anyway.

Signed-off-by: Andrey Drobyshev 
Acked-by: Denis V. Lunev 
---
 accel/kvm/kvm-all.c | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index e39a810a4e..d74b3f0b0e 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -2816,6 +2816,14 @@ static void kvm_eat_signals(CPUState *cpu)
 } while (sigismember(, SIG_IPI));
 }
 
+static void kvm_emit_guest_crash(CPUState *cpu)
+{
+kvm_cpu_synchronize_state(cpu);
+qemu_mutex_lock_iothread();
+qemu_system_guest_panicked(cpu_get_crash_info(cpu));
+qemu_mutex_unlock_iothread();
+}
+
 int kvm_cpu_exec(CPUState *cpu)
 {
 struct kvm_run *run = cpu->kvm_run;
@@ -2969,21 +2977,24 @@ int kvm_cpu_exec(CPUState *cpu)
 ret = EXCP_INTERRUPT;
 break;
 case KVM_SYSTEM_EVENT_CRASH:
-kvm_cpu_synchronize_state(cpu);
-qemu_mutex_lock_iothread();
-qemu_system_guest_panicked(cpu_get_crash_info(cpu));
-qemu_mutex_unlock_iothread();
+kvm_emit_guest_crash(cpu);
 ret = 0;
 break;
 default:
 DPRINTF("kvm_arch_handle_exit\n");
 ret = kvm_arch_handle_exit(cpu, run);
+if (ret < 0) {
+kvm_emit_guest_crash(cpu);
+}
 break;
 }
 break;
 default:
 DPRINTF("kvm_arch_handle_exit\n");
 ret = kvm_arch_handle_exit(cpu, run);
+if (ret < 0) {
+kvm_emit_guest_crash(cpu);
+}
 break;
 }
 } while (ret == 0);
-- 
2.39.3




Re: [PATCH 1/6] qemu-img: rebase: stop when reaching EOF of old backing file

2023-10-26 Thread Andrey Drobyshev
On 10/26/23 09:32, Michael Tokarev wrote:
> 01.06.2023 22:28, Andrey Drobyshev via:
>> In case when we're rebasing within one backing chain, and when target
>> image
>> is larger than old backing file, bdrv_is_allocated_above() ends up
>> setting
>> *pnum = 0.  As a result, target offset isn't getting incremented, and we
>> get stuck in an infinite for loop.  Let's detect this case and proceed
>> further down the loop body, as the offsets beyond the old backing size
>> need
>> to be explicitly zeroed.
> 
> Ping? Has this been forgotten? It's a few months already..
> 
> /mjt

Hi Michael,

It's not forgotten, there's already v3 of this series and it's already
taken to the block branch by Kevin:

https://lists.nongnu.org/archive/html/qemu-block/2023-09/msg00593.html

Andrey



[PATCH 7/7] iotests/271: check disk usage on subcluster-based discard/unmap

2023-10-20 Thread Andrey Drobyshev
Add _verify_du_delta() checker which is used to check that real disk
usage delta meets the expectations.  For now we use it for checking that
subcluster-based discard/unmap operations lead to actual disk usage
decrease (i.e. PUNCH_HOLE operation is performed).

Also add separate test case for discarding particular subcluster within
one cluster.

Signed-off-by: Andrey Drobyshev 
---
 tests/qemu-iotests/271 | 25 -
 tests/qemu-iotests/271.out |  2 ++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271
index c7c2cadda0..5fcb209f5f 100755
--- a/tests/qemu-iotests/271
+++ b/tests/qemu-iotests/271
@@ -81,6 +81,15 @@ _verify_l2_bitmap()
 fi
 }
 
+# Check disk usage delta after a discard/unmap operation
+# _verify_du_delta $before $after $expected_delta
+_verify_du_delta()
+{
+if [ $(($1 - $2)) -ne $3 ]; then
+printf "ERROR: unexpected delta: $1 - $2 = $(($1 - $2)) != $3\n"
+fi
+}
+
 # This should be called as _run_test c=XXX sc=XXX off=XXX len=XXX cmd=XXX
 # c:   cluster number (0 if unset)
 # sc:  subcluster number inside cluster @c (0 if unset)
@@ -198,9 +207,12 @@ for use_backing_file in yes no; do
 alloc="$(seq 0 31)"; zero=""
 _run_test sc=0 len=64k
 
-### Zero and unmap half of cluster #0 (this won't unmap it)
+### Zero and unmap half of cluster #0 (this will unmap it)
 alloc="$(seq 16 31)"; zero="$(seq 0 15)"
+before=$(disk_usage "$TEST_IMG")
 _run_test sc=0 len=32k cmd=unmap
+after=$(disk_usage "$TEST_IMG")
+_verify_du_delta $before $after 32768
 
 ### Zero and unmap cluster #0
 alloc=""; zero="$(seq 0 31)"
@@ -447,7 +459,10 @@ for use_backing_file in yes no; do
 
 # Subcluster-aligned request from clusters #12 to #14
 alloc="$(seq 0 15)"; zero="$(seq 16 31)"
+before=$(disk_usage "$TEST_IMG")
 _run_test c=12 sc=16 len=128k cmd=unmap
+after=$(disk_usage "$TEST_IMG")
+_verify_du_delta $before $after $((128 * 1024))
 alloc=""; zero="$(seq 0 31)"
 _verify_l2_bitmap 13
 alloc="$(seq 16 31)"; zero="$(seq 0 15)"
@@ -528,6 +543,14 @@ for use_backing_file in yes no; do
 else
 _make_test_img -o extended_l2=on 1M
 fi
+# Write cluster #0 and discard its subclusters #0-#3
+$QEMU_IO -c 'write -q 0 64k' "$TEST_IMG"
+before=$(disk_usage "$TEST_IMG")
+$QEMU_IO -c 'discard -q 0 8k' "$TEST_IMG"
+after=$(disk_usage "$TEST_IMG")
+_verify_du_delta $before $after 8192
+alloc="$(seq 4 31)"; zero="$(seq 0 3)"
+_verify_l2_bitmap 0
 # Write clusters #0-#2 and then discard them
 $QEMU_IO -c 'write -q 0 128k' "$TEST_IMG"
 $QEMU_IO -c 'discard -q 0 128k' "$TEST_IMG"
diff --git a/tests/qemu-iotests/271.out b/tests/qemu-iotests/271.out
index 5be780de76..0da8d72cde 100644
--- a/tests/qemu-iotests/271.out
+++ b/tests/qemu-iotests/271.out
@@ -426,6 +426,7 @@ L2 entry #29: 0x 
 ### Discarding clusters with non-zero bitmaps (backing file: yes) ###
 
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 
backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=raw
+L2 entry #0: 0x8005 0000
 L2 entry #0: 0x 
 L2 entry #1: 0x 
 Image resized.
@@ -436,6 +437,7 @@ L2 entry #1: 0x 
 ### Discarding clusters with non-zero bitmaps (backing file: no) ###
 
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
+L2 entry #0: 0x8005 0000
 L2 entry #0: 0x 
 L2 entry #1: 0x 
 Image resized.
-- 
2.39.3




[PATCH 3/7] qcow2: zeroize the entire cluster when there're no non-zero subclusters

2023-10-20 Thread Andrey Drobyshev
When zeroizing the last non-zero subclusters within single cluster, it
makes sense to go zeroize the entire cluster and go down zero_in_l2_slice()
path right away.  That way we'd also update the corresponding refcount
table.

Signed-off-by: Andrey Drobyshev 
---
 block/qcow2-cluster.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 8801856b93..7c6fa5524c 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -2145,7 +2145,7 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
 
 static int coroutine_fn GRAPH_RDLOCK
 zero_l2_subclusters(BlockDriverState *bs, uint64_t offset,
-unsigned nb_subclusters)
+unsigned nb_subclusters, int flags)
 {
 BDRVQcow2State *s = bs->opaque;
 uint64_t new_l2_bitmap;
@@ -2161,6 +2161,17 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t 
offset,
 new_l2_bitmap |=  QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
 new_l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
 
+/*
+ * If there're no non-zero subclusters left, we might as well zeroize
+ * the entire cluster.  That way we'd also update the refcount table.
+ */
+if ((new_l2_bitmap & QCOW_L2_BITMAP_ALL_ZEROES) ==
+QCOW_L2_BITMAP_ALL_ZEROES) {
+qcow2_cache_put(s->l2_table_cache, (void **) _slice);
+return zero_in_l2_slice(bs, QEMU_ALIGN_DOWN(offset, s->cluster_size),
+1, flags);
+}
+
 if (new_l2_bitmap != scri.l2_bitmap) {
 set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap);
 qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice);
@@ -2221,7 +2232,7 @@ int coroutine_fn 
qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
 
 if (head) {
 ret = zero_l2_subclusters(bs, offset - head,
-  size_to_subclusters(s, head));
+  size_to_subclusters(s, head), flags);
 if (ret < 0) {
 goto fail;
 }
@@ -2242,7 +2253,8 @@ int coroutine_fn 
qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
 }
 
 if (tail) {
-ret = zero_l2_subclusters(bs, end_offset, size_to_subclusters(s, 
tail));
+ret = zero_l2_subclusters(bs, end_offset,
+  size_to_subclusters(s, tail), flags);
 if (ret < 0) {
 goto fail;
 }
-- 
2.39.3




[PATCH 2/7] qcow2: add get_sc_range_info() helper for working with subcluster ranges

2023-10-20 Thread Andrey Drobyshev
This helper simply obtains the l2 table parameters of the cluster which
contains the given subclusters range.  Right now this info is being
obtained and used by zero_l2_subclusters().  As we're about to introduce
the subclusters discard operation, this helper would let us avoid code
duplication.

Also introduce struct SubClusterRangeInfo, which would contain all the
needed params.

Signed-off-by: Andrey Drobyshev 
---
 block/qcow2-cluster.c | 90 +--
 1 file changed, 62 insertions(+), 28 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 904f00d1b3..8801856b93 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -32,6 +32,13 @@
 #include "qemu/memalign.h"
 #include "trace.h"
 
+typedef struct SubClusterRangeInfo {
+uint64_t *l2_slice;
+int l2_index;
+uint64_t l2_entry;
+uint64_t l2_bitmap;
+} SubClusterRangeInfo;
+
 int coroutine_fn qcow2_shrink_l1_table(BlockDriverState *bs,
uint64_t exact_size)
 {
@@ -1892,6 +1899,50 @@ again:
 return 0;
 }
 
+static int get_sc_range_info(BlockDriverState *bs, uint64_t offset,
+ unsigned nb_subclusters,
+ SubClusterRangeInfo *scri)
+{
+BDRVQcow2State *s = bs->opaque;
+int ret, sc_cleared = 0, sc_index = offset_to_sc_index(s, offset);
+QCow2SubclusterType sctype;
+
+/* Here we only work with the subclusters within single cluster. */
+assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster);
+assert(sc_index + nb_subclusters <= s->subclusters_per_cluster);
+assert(offset_into_subcluster(s, offset) == 0);
+
+ret = get_cluster_table(bs, offset, >l2_slice, >l2_index);
+if (ret < 0) {
+return ret;
+}
+
+scri->l2_entry = get_l2_entry(s, scri->l2_slice, scri->l2_index);
+scri->l2_bitmap = get_l2_bitmap(s, scri->l2_slice, scri->l2_index);
+
+do {
+qcow2_get_subcluster_range_type(bs, scri->l2_entry, scri->l2_bitmap,
+sc_index, );
+if (ret < 0) {
+return ret;
+}
+
+switch (sctype) {
+case QCOW2_SUBCLUSTER_COMPRESSED:
+/* We cannot partially zeroize/discard compressed clusters. */
+return -ENOTSUP;
+case QCOW2_SUBCLUSTER_INVALID:
+return -EINVAL;
+default:
+break;
+}
+
+sc_cleared += ret;
+} while (sc_cleared < nb_subclusters);
+
+return 0;
+}
+
 /*
  * This discards as many clusters of nb_clusters as possible at once (i.e.
  * all clusters in the same L2 slice) and returns the number of discarded
@@ -2097,44 +2148,27 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t 
offset,
 unsigned nb_subclusters)
 {
 BDRVQcow2State *s = bs->opaque;
-uint64_t *l2_slice;
-uint64_t old_l2_bitmap, l2_bitmap;
-int l2_index, ret, sc = offset_to_sc_index(s, offset);
+uint64_t new_l2_bitmap;
+int ret, sc = offset_to_sc_index(s, offset);
+SubClusterRangeInfo scri = { 0 };
 
-/* For full clusters use zero_in_l2_slice() instead */
-assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster);
-assert(sc + nb_subclusters <= s->subclusters_per_cluster);
-assert(offset_into_subcluster(s, offset) == 0);
-
-ret = get_cluster_table(bs, offset, _slice, _index);
+ret = get_sc_range_info(bs, offset, nb_subclusters, );
 if (ret < 0) {
-return ret;
-}
-
-switch (qcow2_get_cluster_type(bs, get_l2_entry(s, l2_slice, l2_index))) {
-case QCOW2_CLUSTER_COMPRESSED:
-ret = -ENOTSUP; /* We cannot partially zeroize compressed clusters */
 goto out;
-case QCOW2_CLUSTER_NORMAL:
-case QCOW2_CLUSTER_UNALLOCATED:
-break;
-default:
-g_assert_not_reached();
 }
 
-old_l2_bitmap = l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
-
-l2_bitmap |=  QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
-l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
+new_l2_bitmap = scri.l2_bitmap;
+new_l2_bitmap |=  QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
+new_l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
 
-if (old_l2_bitmap != l2_bitmap) {
-set_l2_bitmap(s, l2_slice, l2_index, l2_bitmap);
-qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
+if (new_l2_bitmap != scri.l2_bitmap) {
+set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap);
+qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice);
 }
 
 ret = 0;
 out:
-qcow2_cache_put(s->l2_table_cache, (void **) _slice);
+qcow2_cache_put(s->l2_table_cache, (void **) _slice);
 
 return ret;
 }
-- 
2.39.3




[PATCH 5/7] qcow2: zero_l2_subclusters: fall through to discard operation when requested

2023-10-20 Thread Andrey Drobyshev
When zeroizing subclusters within single cluster, detect usage of the
BDRV_REQ_MAY_UNMAP flag and fall through to the subcluster-based discard
operation, much like it's done with the cluster-based discards.  That
way subcluster-aligned operations "qemu-io -c 'write -z -u ...'" will
lead to actual unmap.

Signed-off-by: Andrey Drobyshev 
---
 block/qcow2-cluster.c | 17 ++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index cf40f2dc12..040251f2c3 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -2242,7 +2242,7 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t offset,
 unsigned nb_subclusters, int flags)
 {
 BDRVQcow2State *s = bs->opaque;
-uint64_t new_l2_bitmap;
+uint64_t new_l2_bitmap, l2_bitmap_mask;
 int ret, sc = offset_to_sc_index(s, offset);
 SubClusterRangeInfo scri = { 0 };
 
@@ -2251,9 +2251,10 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t 
offset,
 goto out;
 }
 
+l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
 new_l2_bitmap = scri.l2_bitmap;
-new_l2_bitmap |=  QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
-new_l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
+new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
+new_l2_bitmap &= ~l2_bitmap_mask;
 
 /*
  * If there're no non-zero subclusters left, we might as well zeroize
@@ -2266,6 +2267,16 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t 
offset,
 1, flags);
 }
 
+/*
+ * If the request allows discarding subclusters and they're actually
+ * allocated, we go down the discard path since after the discard
+ * operation the subclusters are going to be read as zeroes anyway.
+ */
+if ((flags & BDRV_REQ_MAY_UNMAP) && (scri.l2_bitmap & l2_bitmap_mask)) {
+return discard_l2_subclusters(bs, offset, nb_subclusters,
+  QCOW2_DISCARD_REQUEST, false, );
+}
+
 if (new_l2_bitmap != scri.l2_bitmap) {
 set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap);
 qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice);
-- 
2.39.3




[PATCH 0/7] qcow2: make subclusters discardable

2023-10-20 Thread Andrey Drobyshev
This series extends the discard operation for qcow2 driver so that it
can be used to discard individial subclusters.  This feature might prove
useful and gain some host disk space when dealing with the guest
TRIM/discard requests, especially with large clusters (1M, 2M) when
subclusters are enabled.

Andrey Drobyshev (7):
  qcow2: make function update_refcount_discard() global
  qcow2: add get_sc_range_info() helper for working with subcluster
ranges
  qcow2: zeroize the entire cluster when there're no non-zero
subclusters
  qcow2: make subclusters discardable
  qcow2: zero_l2_subclusters: fall through to discard operation when
requested
  iotests/common.rc: add disk_usage function
  iotests/271: check disk usage on subcluster-based discard/unmap

 block/qcow2-cluster.c| 217 +--
 block/qcow2-refcount.c   |   8 +-
 block/qcow2.c|   8 +-
 block/qcow2.h|   2 +
 tests/qemu-iotests/250   |   5 -
 tests/qemu-iotests/271   |  25 +++-
 tests/qemu-iotests/271.out   |   2 +
 tests/qemu-iotests/common.rc |   6 +
 8 files changed, 226 insertions(+), 47 deletions(-)

-- 
2.39.3




[PATCH 4/7] qcow2: make subclusters discardable

2023-10-20 Thread Andrey Drobyshev
This commit makes the discard operation work on the subcluster level
rather than cluster level.  It introduces discard_l2_subclusters()
function and makes use of it in qcow2 discard implementation, much like
it's done with zero_in_l2_slice() / zero_l2_subclusters().  It also
changes the qcow2 driver pdiscard_alignment to subcluster_size.  That
way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE)
operation and free host disk space.

This feature will let us gain additional disk space on guest
TRIM/discard requests, especially when using large enough clusters
(1M, 2M) with subclusters enabled.

Signed-off-by: Andrey Drobyshev 
---
 block/qcow2-cluster.c | 100 --
 block/qcow2.c |   8 ++--
 2 files changed, 101 insertions(+), 7 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 7c6fa5524c..cf40f2dc12 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -2042,6 +2042,74 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t 
offset, uint64_t nb_clusters,
 return nb_clusters;
 }
 
+static int coroutine_fn GRAPH_RDLOCK
+discard_l2_subclusters(BlockDriverState *bs, uint64_t offset,
+   uint64_t nb_subclusters,
+   enum qcow2_discard_type type,
+   bool full_discard,
+   SubClusterRangeInfo *pscri)
+{
+BDRVQcow2State *s = bs->opaque;
+uint64_t new_l2_bitmap, l2_bitmap_mask;
+int ret, sc = offset_to_sc_index(s, offset);
+SubClusterRangeInfo scri = { 0 };
+
+if (!pscri) {
+ret = get_sc_range_info(bs, offset, nb_subclusters, );
+if (ret < 0) {
+goto out;
+}
+} else {
+scri = *pscri;
+}
+
+l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
+new_l2_bitmap = scri.l2_bitmap;
+new_l2_bitmap &= ~l2_bitmap_mask;
+
+/*
+ * If there're no allocated subclusters left, we might as well discard
+ * the entire cluster.  That way we'd also update the refcount table.
+ */
+if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) {
+return discard_in_l2_slice(bs,
+   QEMU_ALIGN_DOWN(offset, s->cluster_size),
+   1, type, full_discard);
+}
+
+/*
+ * Full discard means we fall through to the backing file, thus we only
+ * need to mark the subclusters as deallocated.
+ *
+ * Non-full discard means subclusters should be explicitly marked as
+ * zeroes.  In this case QCOW2 specification requires the corresponding
+ * allocation status bits to be unset as well.  If the subclusters are
+ * deallocated in the first place and there's no backing, the operation
+ * can be skipped.
+ */
+if (!full_discard &&
+(bs->backing || scri.l2_bitmap & l2_bitmap_mask)) {
+new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
+}
+
+if (scri.l2_bitmap != new_l2_bitmap) {
+set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap);
+qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice);
+}
+
+if (s->discard_passthrough[type]) {
+qcow2_queue_discard(bs, (scri.l2_entry & L2E_OFFSET_MASK) +
+offset_into_cluster(s, offset),
+nb_subclusters * s->subcluster_size);
+}
+
+ret = 0;
+out:
+qcow2_cache_put(s->l2_table_cache, (void **) _slice);
+
+return ret;
+}
+
 int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
   uint64_t bytes, enum qcow2_discard_type type,
   bool full_discard)
@@ -2049,19 +2117,36 @@ int qcow2_cluster_discard(BlockDriverState *bs, 
uint64_t offset,
 BDRVQcow2State *s = bs->opaque;
 uint64_t end_offset = offset + bytes;
 uint64_t nb_clusters;
+unsigned head, tail;
 int64_t cleared;
 int ret;
 
 /* Caller must pass aligned values, except at image end */
-assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
-assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
+assert(QEMU_IS_ALIGNED(offset, s->subcluster_size));
+assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) ||
end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
 
-nb_clusters = size_to_clusters(s, bytes);
+head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset;
+offset += head;
+
+tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 :
+   end_offset - MAX(offset, start_of_cluster(s, end_offset));
+end_offset -= tail;
 
 s->cache_discards = true;
 
+if (head) {
+ret = discard_l2_subclusters(bs, offset - head,
+ size_to_subclusters(s, head), type,
+ full_discard, NULL);
+i

[PATCH 1/7] qcow2: make function update_refcount_discard() global

2023-10-20 Thread Andrey Drobyshev
We are going to need it for discarding separate subclusters.  The
function itself doesn't do anything with the refcount tables, it simply
adds a discard request to the queue, so rename it to qcow2_queue_discard().

Signed-off-by: Andrey Drobyshev 
---
 block/qcow2-refcount.c | 8 
 block/qcow2.h  | 2 ++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 0266542cee..2026f5fa21 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -754,8 +754,8 @@ void qcow2_process_discards(BlockDriverState *bs, int ret)
 }
 }
 
-static void update_refcount_discard(BlockDriverState *bs,
-uint64_t offset, uint64_t length)
+void qcow2_queue_discard(BlockDriverState *bs, uint64_t offset,
+ uint64_t length)
 {
 BDRVQcow2State *s = bs->opaque;
 Qcow2DiscardRegion *d, *p, *next;
@@ -902,7 +902,7 @@ update_refcount(BlockDriverState *bs, int64_t offset, 
int64_t length,
 }
 
 if (s->discard_passthrough[type]) {
-update_refcount_discard(bs, cluster_offset, s->cluster_size);
+qcow2_queue_discard(bs, cluster_offset, s->cluster_size);
 }
 }
 }
@@ -3619,7 +3619,7 @@ qcow2_discard_refcount_block(BlockDriverState *bs, 
uint64_t discard_block_offs)
 /* discard refblock from the cache if refblock is cached */
 qcow2_cache_discard(s->refcount_block_cache, refblock);
 }
-update_refcount_discard(bs, discard_block_offs, s->cluster_size);
+qcow2_queue_discard(bs, discard_block_offs, s->cluster_size);
 
 return 0;
 }
diff --git a/block/qcow2.h b/block/qcow2.h
index 29958c512b..75d6a1b61b 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -891,6 +891,8 @@ int coroutine_fn qcow2_check_refcounts(BlockDriverState 
*bs, BdrvCheckResult *re
BdrvCheckMode fix);
 
 void qcow2_process_discards(BlockDriverState *bs, int ret);
+void qcow2_queue_discard(BlockDriverState *bs, uint64_t offset,
+ uint64_t length);
 
 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
  int64_t size);
-- 
2.39.3




[PATCH 6/7] iotests/common.rc: add disk_usage function

2023-10-20 Thread Andrey Drobyshev
Move the definition from iotests/250 to common.rc.  This is used to
detect real disk usage of sparse files.  In particular, we want to use
it for checking subclusters-based discards.

Signed-off-by: Andrey Drobyshev 
---
 tests/qemu-iotests/250   | 5 -
 tests/qemu-iotests/common.rc | 6 ++
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/qemu-iotests/250 b/tests/qemu-iotests/250
index af48f83aba..c0a0dbc0ff 100755
--- a/tests/qemu-iotests/250
+++ b/tests/qemu-iotests/250
@@ -52,11 +52,6 @@ _unsupported_imgopts data_file
 # bdrv_co_truncate(bs->file) call in qcow2_co_truncate(), which might succeed
 # anyway.
 
-disk_usage()
-{
-du --block-size=1 $1 | awk '{print $1}'
-}
-
 size=2100M
 
 _make_test_img -o "cluster_size=1M,preallocation=metadata" $size
diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 95c12577dd..5d2ea26c7f 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -140,6 +140,12 @@ _optstr_add()
 fi
 }
 
+# report real disk usage for sparse files
+disk_usage()
+{
+du --block-size=1 $1 | awk '{print $1}'
+}
+
 # Set the variables to the empty string to turn Valgrind off
 # for specific processes, e.g.
 # $ VALGRIND_QEMU_IO= ./check -qcow2 -valgrind 015
-- 
2.39.3




Re: [PATCH v3 0/8] qemu-img: rebase: add compression support

2023-10-16 Thread Andrey Drobyshev
On 10/2/23 09:35, Andrey Drobyshev wrote:
> On 9/19/23 20:57, Andrey Drobyshev wrote:
>> v2 --> v3:
>>  * Patch 3/8: fixed logic in the if statement, so that we align on blk
>>when blk_old_backing == NULL;
>>  * Patch 4/8: comment fix;
>>  * Patch 5/8: comment fix; dropped redundant "if (blk_new_backing)"
>>statements.
>>
>> v2: https://lists.nongnu.org/archive/html/qemu-block/2023-09/msg00448.html
>>
>> Andrey Drobyshev (8):
>>   qemu-img: rebase: stop when reaching EOF of old backing file
>>   qemu-iotests: 024: add rebasing test case for overlay_size >
>> backing_size
>>   qemu-img: rebase: use backing files' BlockBackend for buffer alignment
>>   qemu-img: add chunk size parameter to compare_buffers()
>>   qemu-img: rebase: avoid unnecessary COW operations
>>   iotests/{024, 271}: add testcases for qemu-img rebase
>>   qemu-img: add compression option to rebase subcommand
>>   iotests: add tests for "qemu-img rebase" with compression
>>
>>  docs/tools/qemu-img.rst|   6 +-
>>  qemu-img-cmds.hx   |   4 +-
>>  qemu-img.c | 136 ++
>>  tests/qemu-iotests/024 | 117 ++
>>  tests/qemu-iotests/024.out |  73 
>>  tests/qemu-iotests/271 | 131 +
>>  tests/qemu-iotests/271.out |  82 ++
>>  tests/qemu-iotests/314 | 165 +
>>  tests/qemu-iotests/314.out |  75 +
>>  9 files changed, 752 insertions(+), 37 deletions(-)
>>  create mode 100755 tests/qemu-iotests/314
>>  create mode 100644 tests/qemu-iotests/314.out
>>
> 
> Ping

Friendly ping



Re: [PATCH v3 0/8] qemu-img: rebase: add compression support

2023-10-02 Thread Andrey Drobyshev
On 9/19/23 20:57, Andrey Drobyshev wrote:
> v2 --> v3:
>  * Patch 3/8: fixed logic in the if statement, so that we align on blk
>when blk_old_backing == NULL;
>  * Patch 4/8: comment fix;
>  * Patch 5/8: comment fix; dropped redundant "if (blk_new_backing)"
>statements.
> 
> v2: https://lists.nongnu.org/archive/html/qemu-block/2023-09/msg00448.html
> 
> Andrey Drobyshev (8):
>   qemu-img: rebase: stop when reaching EOF of old backing file
>   qemu-iotests: 024: add rebasing test case for overlay_size >
> backing_size
>   qemu-img: rebase: use backing files' BlockBackend for buffer alignment
>   qemu-img: add chunk size parameter to compare_buffers()
>   qemu-img: rebase: avoid unnecessary COW operations
>   iotests/{024, 271}: add testcases for qemu-img rebase
>   qemu-img: add compression option to rebase subcommand
>   iotests: add tests for "qemu-img rebase" with compression
> 
>  docs/tools/qemu-img.rst|   6 +-
>  qemu-img-cmds.hx   |   4 +-
>  qemu-img.c | 136 ++
>  tests/qemu-iotests/024 | 117 ++
>  tests/qemu-iotests/024.out |  73 
>  tests/qemu-iotests/271 | 131 +
>  tests/qemu-iotests/271.out |  82 ++
>  tests/qemu-iotests/314 | 165 +
>  tests/qemu-iotests/314.out |  75 +
>  9 files changed, 752 insertions(+), 37 deletions(-)
>  create mode 100755 tests/qemu-iotests/314
>  create mode 100644 tests/qemu-iotests/314.out
> 

Ping



Re: [PATCH] maint: Tweak comment in mailmap to sound friendlier

2023-09-27 Thread Andrey Drobyshev
On 9/26/23 23:39, ebl...@redhat.com wrote:
> From: Eric Blake 
> 
> Documenting that we should not add new lines to work around SPF
> rewrites sounds foreboding; the intent is instead that new lines here
> are okay, but indicate a second problem elsewhere in our build process
> that we should also consider fixing at the same time, to keep the
> section from growing without bounds.
> 
> Mentioning DMARC alongside SPF may also help people grep for this
> scenario, as well as documenting the 'git config' workaround that can
> be used by submitters to avoid the munging issue in the first place.
> 
> Fixes: 3bd2608d ("maint: Add .mailmap entries for patches claiming list 
> authorship")
> CC: Andrey Drobyshev 
> Cc: Peter Maydell 
> Signed-off-by: Eric Blake 
> ---
> 
> I'm sending this email with a temporary 'git config sendemail.from
> ebl...@redhat.com', to see if the added advice actually adds the extra
> line. It did not show up in my editor window, though, so this patch
> may need further tweaking to get the instructions right.  Since I
> don't normally suffer from SPF/DMARC munging, I may not be the best
> person to test the right workaround.  Or maybe 'git config' does not
> yet have the right workaround already available as a turnkey solution
> we can suggest.
> 

The only drawback of this approach is that mail clients, as well as
tools like patchew.org now only show your "" in From/Author
field.  You can see it here:

https://patchew.org/search?q=project%3AQEMU+from%3ABlake

In your email there're 2 "From:" fields now:

> Headers...
> From: ebl...@redhat.com
> More headers...
> From: Eric Blake 
> Actual patch

Apparently, mail clients prefer to pay attention on the very first
"From:" entry, while tools like "git am" -- on the last.  If we managed
to make those entries both be in the format "name " -- that'd be
ideal.  However, as I pointed out in another thread, if we set
sendemail.from to "name ", the 2nd entry doesn't get added since
they're now identical.  So you figure out the way to get 2 identical
"From:" entries -- please let us know.

Having all that said, it would still be nice to have additional checks
for "qemu-bl...@nongnu.org" authorship, as Peter mentioned in the
previous thread.

Andrey



[PATCH] mailmap: Fix Andrey Drobyshev author email

2023-09-26 Thread andrey . drobyshev--- via
From: Andrey Drobyshev 

This fixes authorship of commits 2848289168, 52b10c9c0c as the mailing
list rewrote the "From:" field in the corresponding patches.  See commit
3bd2608db7 ("maint: Add .mailmap entries for patches claiming list
authorship") for explanation.

Signed-off-by: Andrey Drobyshev 
---
 .mailmap | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.mailmap b/.mailmap
index 64ef9f4de6..04a7feb005 100644
--- a/.mailmap
+++ b/.mailmap
@@ -46,6 +46,7 @@ Ian McKellar  Ian McKellar via Qemu-devel 
 Julia Suvorova via Qemu-devel 

 Justin Terry (VM)  Justin Terry (VM) via Qemu-devel 

 Stefan Weil  Stefan Weil via 
+Andrey Drobyshev  Andrey Drobyshev via 

 
 # Next, replace old addresses by a more recent one.
 Aleksandar Markovic  

-- 
2.41.0




[PATCH v3 1/8] qemu-img: rebase: stop when reaching EOF of old backing file

2023-09-19 Thread Andrey Drobyshev via
In case when we're rebasing within one backing chain, and when target image
is larger than old backing file, bdrv_is_allocated_above() ends up setting
*pnum = 0.  As a result, target offset isn't getting incremented, and we
get stuck in an infinite for loop.  Let's detect this case and proceed
further down the loop body, as the offsets beyond the old backing size need
to be explicitly zeroed.

Signed-off-by: Andrey Drobyshev 
Reviewed-by: Denis V. Lunev 
Reviewed-by: Hanna Czenczek 
---
 qemu-img.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/qemu-img.c b/qemu-img.c
index a48edb7101..50660ba920 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3805,6 +3805,8 @@ static int img_rebase(int argc, char **argv)
 }
 
 if (prefix_chain_bs) {
+uint64_t bytes = n;
+
 /*
  * If cluster wasn't changed since prefix_chain, we don't need
  * to take action
@@ -3817,9 +3819,18 @@ static int img_rebase(int argc, char **argv)
  strerror(-ret));
 goto out;
 }
-if (!ret) {
+if (!ret && n) {
 continue;
 }
+if (!n) {
+/*
+ * If we've reached EOF of the old backing, it means that
+ * offsets beyond the old backing size were read as zeroes.
+ * Now we will need to explicitly zero the cluster in
+ * order to preserve that state after the rebase.
+ */
+n = bytes;
+}
 }
 
 /*
-- 
2.39.3




[PATCH v3 5/8] qemu-img: rebase: avoid unnecessary COW operations

2023-09-19 Thread Andrey Drobyshev via
When rebasing an image from one backing file to another, we need to
compare data from old and new backings.  If the diff between that data
happens to be unaligned to the target cluster size, we might end up
doing partial writes, which would lead to copy-on-write and additional IO.

Consider the following simple case (virtual_size == cluster_size == 64K):

base <-- inc1 <-- inc2

qemu-io -c "write -P 0xaa 0 32K" base.qcow2
qemu-io -c "write -P 0xcc 32K 32K" base.qcow2
qemu-io -c "write -P 0xbb 0 32K" inc1.qcow2
qemu-io -c "write -P 0xcc 32K 32K" inc1.qcow2
qemu-img rebase -f qcow2 -b base.qcow2 -F qcow2 inc2.qcow2

While doing rebase, we'll write a half of the cluster to inc2, and block
layer will have to read the 2nd half of the same cluster from the base image
inc1 while doing this write operation, although the whole cluster is already
read earlier to perform data comparison.

In order to avoid these unnecessary IO cycles, let's make sure every
write request is aligned to the overlay subcluster boundaries.  Using
subcluster size is universal as for the images which don't have them
this size equals to the cluster size. so in any case we end up aligning
to the smallest unit of allocation.

Signed-off-by: Andrey Drobyshev 
---
 qemu-img.c | 74 +++---
 1 file changed, 54 insertions(+), 20 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 0f67b021f7..a2d6241648 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3523,6 +3523,7 @@ static int img_rebase(int argc, char **argv)
 uint8_t *buf_new = NULL;
 BlockDriverState *bs = NULL, *prefix_chain_bs = NULL;
 BlockDriverState *unfiltered_bs;
+BlockDriverInfo bdi = {0};
 char *filename;
 const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg;
 int c, flags, src_flags, ret;
@@ -3533,6 +3534,7 @@ static int img_rebase(int argc, char **argv)
 bool quiet = false;
 Error *local_err = NULL;
 bool image_opts = false;
+int64_t write_align;
 
 /* Parse commandline parameters */
 fmt = NULL;
@@ -3656,6 +3658,20 @@ static int img_rebase(int argc, char **argv)
 }
 }
 
+/*
+ * We need overlay subcluster size to make sure write requests are
+ * aligned.
+ */
+ret = bdrv_get_info(unfiltered_bs, );
+if (ret < 0) {
+error_report("could not get block driver info");
+goto out;
+} else if (bdi.subcluster_size == 0) {
+bdi.subcluster_size = 1;
+}
+
+write_align = bdi.subcluster_size;
+
 /* For safe rebasing we need to compare old and new backing file */
 if (!unsafe) {
 QDict *options = NULL;
@@ -3753,7 +3769,7 @@ static int img_rebase(int argc, char **argv)
 int64_t old_backing_size = 0;
 int64_t new_backing_size = 0;
 uint64_t offset;
-int64_t n;
+int64_t n, n_old = 0, n_new = 0;
 float local_progress = 0;
 
 if (blk_old_backing && bdrv_opt_mem_align(blk_bs(blk_old_backing)) >
@@ -3799,7 +3815,8 @@ static int img_rebase(int argc, char **argv)
 }
 
 for (offset = 0; offset < size; offset += n) {
-bool buf_old_is_zero = false;
+bool old_backing_eof = false;
+int64_t n_alloc;
 
 /* How many bytes can we handle with the next read? */
 n = MIN(IO_BUF_SIZE, size - offset);
@@ -3844,33 +3861,46 @@ static int img_rebase(int argc, char **argv)
 }
 }
 
+/*
+ * At this point we know that the region [offset; offset + n)
+ * is unallocated within the target image.  This region might be
+ * unaligned to the target image's (sub)cluster boundaries, as
+ * old backing may have smaller clusters (or have subclusters).
+ * We extend it to the aligned boundaries to avoid CoW on
+ * partial writes in blk_pwrite(),
+ */
+n += offset - QEMU_ALIGN_DOWN(offset, write_align);
+offset = QEMU_ALIGN_DOWN(offset, write_align);
+n += QEMU_ALIGN_UP(offset + n, write_align) - (offset + n);
+n = MIN(n, size - offset);
+assert(!bdrv_is_allocated(unfiltered_bs, offset, n, _alloc) &&
+   n_alloc == n);
+
+/*
+ * Much like with the target image, we'll try to read as much
+ * of the old and new backings as we can.
+ */
+n_old = MIN(n, MAX(0, old_backing_size - (int64_t) offset));
+n_new = MIN(n, MAX(0, new_backing_size - (int64_t) offset));
+
 /*
  * Read old and new backing file and take into consideration that
  * backing files may be smaller than the COW image.
  */
-if (offset >= old_backing_size) {
-memset(buf_old, 0, n);
-buf_old_is_zero

  1   2   >