[PATCH v2 03/11] qcow2: put discard requests in the common queue when discard-no-unref enabled
Normally discard requests are stored in the queue attached to BDRVQcow2State to be processed later at once. Currently discard-no-unref option handling causes these requests to be processed straight away. Let's fix that. Note that when doing regular discards qcow2_free_any_cluster() would check for the presence of external data files for us and redirect request to underlying data_file. Here we want to do the same but avoid refcount updates, thus we perform the same checks. Suggested-by: Hanna Czenczek Signed-off-by: Andrey Drobyshev --- block/qcow2-cluster.c | 39 +-- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 5f057ba2fd..7dff0bd5a1 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1893,6 +1893,28 @@ again: return 0; } +/* + * Helper for adding a discard request to the queue without any refcount + * modifications. If external data file is used redirects the request to + * the corresponding BdrvChild. + */ +static inline void +discard_no_unref_any_file(BlockDriverState *bs, uint64_t offset, + uint64_t length, QCow2ClusterType ctype, + enum qcow2_discard_type dtype) +{ +BDRVQcow2State *s = bs->opaque; + +if (s->discard_passthrough[dtype] && +(ctype == QCOW2_CLUSTER_NORMAL || ctype == QCOW2_CLUSTER_ZERO_ALLOC)) { +if (has_data_file(bs)) { +bdrv_pdiscard(s->data_file, offset, length); +} else { +qcow2_queue_discard(bs, offset, length); +} +} +} + /* * This discards as many clusters of nb_clusters as possible at once (i.e. * all clusters in the same L2 slice) and returns the number of discarded @@ -1974,12 +1996,10 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters, if (!keep_reference) { /* Then decrease the refcount */ qcow2_free_any_cluster(bs, old_l2_entry, type); -} else if (s->discard_passthrough[type] && - (cluster_type == QCOW2_CLUSTER_NORMAL || -cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) { +} else { /* If we keep the reference, pass on the discard still */ -bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK, - s->cluster_size); +discard_no_unref_any_file(bs, old_l2_entry & L2E_OFFSET_MASK, + s->cluster_size, cluster_type, type); } } @@ -2088,12 +2108,11 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset, if (!keep_reference) { /* Then decrease the refcount */ qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST); -} else if (s->discard_passthrough[QCOW2_DISCARD_REQUEST] && - (type == QCOW2_CLUSTER_NORMAL || -type == QCOW2_CLUSTER_ZERO_ALLOC)) { +} else { /* If we keep the reference, pass on the discard still */ -bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK, -s->cluster_size); +discard_no_unref_any_file(bs, old_l2_entry & L2E_OFFSET_MASK, + s->cluster_size, type, + QCOW2_DISCARD_REQUEST); } } } -- 2.39.3
[PATCH v2 06/11] iotests/290: add test case to check 'discard-no-unref' option behavior
We basically fill 2 images with identical data and perform discard operations with and without 'discard-no-unref' enabled. Then we check that images still read identically, that their disk usage is the same (i.e. fallocate(FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE) is called for both) and that with the option enabled cluster is still marked as allocated in "qemu-img map" output. We also check that the option doesn't work with qcow2 v2 images. Signed-off-by: Andrey Drobyshev --- tests/qemu-iotests/290 | 34 ++ tests/qemu-iotests/290.out | 28 2 files changed, 62 insertions(+) diff --git a/tests/qemu-iotests/290 b/tests/qemu-iotests/290 index 776b59de1b..4eb929d15f 100755 --- a/tests/qemu-iotests/290 +++ b/tests/qemu-iotests/290 @@ -92,6 +92,40 @@ for qcow2_compat in 0.10 1.1; do $QEMU_IMG map "$TEST_IMG" | _filter_testdir done +echo +echo "### Test 'qemu-io -c discard' with 'discard-no-unref' option enabled" +echo + +echo "# Check that qcow2 v2 images don't support 'discard-no-unref' option" +NOUNREF_IMG="$TEST_IMG.nounref" +TEST_IMG="$NOUNREF_IMG" _make_test_img -o "compat=0.10" 128k +# This should immediately fail with an error +$QEMU_IO -c 'reopen -o discard-no-unref=on' "$NOUNREF_IMG" | _filter_qemu_io + +echo "# Create two compat=1.1 images and fill them with identical data" +_make_test_img -o "compat=1.1" 128k +TEST_IMG="$NOUNREF_IMG" _make_test_img -o "compat=1.1" 128k +$QEMU_IO -c 'write -P 0xaa 0 128k' "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c 'write -P 0xaa 0 128k' "$NOUNREF_IMG" | _filter_qemu_io + +echo "# Enable 'discard-no-unref' in one of them, discard 2nd cluster in both" +$QEMU_IO -c 'discard 64k 64k' "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c 'reopen -o discard-no-unref=on' \ + -c 'discard 64k 64k' "$NOUNREF_IMG" | _filter_qemu_io + +echo "# Compare disk usage of the 2 images" +# Don't check the exact disk usage values but rather that they're equal +echo "disk_usage(`basename $TEST_IMG`) - disk_usage(`basename $NOUNREF_IMG`)" \ + "= $(( `disk_usage $TEST_IMG` - `disk_usage $NOUNREF_IMG`))" + +echo "# Check that images are still identical" +$QEMU_IMG compare "$TEST_IMG" "$NOUNREF_IMG" + +echo "# Output of qemu-img map for the image with dropped reference" +$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map +echo "# Output of qemu-img map for the image with kept reference" +$QEMU_IMG map --output=json "$NOUNREF_IMG" | _filter_qemu_img_map + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/290.out b/tests/qemu-iotests/290.out index 22b476594f..f790feae81 100644 --- a/tests/qemu-iotests/290.out +++ b/tests/qemu-iotests/290.out @@ -58,4 +58,32 @@ read 131072/131072 bytes at offset 0 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) # Output of qemu-img map Offset Length Mapped to File + +### Test 'qemu-io -c discard' with 'discard-no-unref' option enabled + +# Check that qcow2 v2 images don't support 'discard-no-unref' option +Formatting 'TEST_DIR/t.IMGFMT.nounref', fmt=IMGFMT size=131072 +qemu-io: discard-no-unref is only supported since qcow2 version 3 +# Create two compat=1.1 images and fill them with identical data +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=131072 +Formatting 'TEST_DIR/t.IMGFMT.nounref', fmt=IMGFMT size=131072 +wrote 131072/131072 bytes at offset 0 +128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 131072/131072 bytes at offset 0 +128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +# Enable 'discard-no-unref' in one of them, discard 2nd cluster in both +discard 65536/65536 bytes at offset 65536 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +discard 65536/65536 bytes at offset 65536 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +# Compare disk usage of the 2 images +disk_usage(t.qcow2) - disk_usage(t.qcow2.nounref) = 0 +# Check that images are still identical +Images are identical. +# Output of qemu-img map for the image with dropped reference +[{ "start": 0, "length": 65536, "depth": 0, "present": true, "zero": false, "data": true, "compressed": false, "offset": OFFSET}, +{ "start": 65536, "length": 65536, "depth": 0, "present": true, "zero": true, "data": false, "compressed": false}] +# Output of qemu-img map for the image with kept reference +[{ "start": 0, "length": 65536, "depth": 0, "present": true, "zero": false, "data": true, "compressed": false, "offset": OFFSET}, +{ "start": 65536, "length": 65536, "depth": 0, "present": true, "zero": true, "data": false, "compressed": false, "offset": OFFSET}] *** done -- 2.39.3
[PATCH v2 00/11] qcow2: make subclusters discardable
v1: https://lists.nongnu.org/archive/html/qemu-devel/2023-10/msg07223.html Andrey Drobyshev (11): qcow2: make function update_refcount_discard() global qcow2: simplify L2 entries accounting for discard-no-unref qcow2: put discard requests in the common queue when discard-no-unref enabled block/file-posix: add trace event for fallocate() calls iotests/common.rc: add disk_usage function iotests/290: add test case to check 'discard-no-unref' option behavior qcow2: add get_sc_range_info() helper for working with subcluster ranges qcow2: zeroize the entire cluster when there're no non-zero subclusters qcow2: make subclusters discardable qcow2: zero_l2_subclusters: fall through to discard operation when requested iotests/271: add test cases for subcluster-based discard/unmap block/file-posix.c | 1 + block/qcow2-cluster.c| 346 --- block/qcow2-refcount.c | 8 +- block/qcow2-snapshot.c | 6 +- block/qcow2.c| 25 +-- block/qcow2.h| 6 +- block/trace-events | 1 + tests/qemu-iotests/250 | 5 - tests/qemu-iotests/271 | 72 ++-- tests/qemu-iotests/271.out | 69 ++- tests/qemu-iotests/290 | 34 tests/qemu-iotests/290.out | 28 +++ tests/qemu-iotests/common.rc | 6 + 13 files changed, 490 insertions(+), 117 deletions(-) -- 2.39.3
[PATCH v2 11/11] iotests/271: add test cases for subcluster-based discard/unmap
Add a bunch of test cases covering new subclusters behaviour: unmap of last allocated subclusters; unmap of subclusters within unallocated cluster; discard of unallocated subclusters within a cluster; regular discard of subclusters within a cluster; discard of last allocated subclusters. Also make all discard/unmap operations enable trace point 'file_do_fallocate' so that actual fallocate() calls are visible. Signed-off-by: Andrey Drobyshev --- tests/qemu-iotests/271 | 70 +- tests/qemu-iotests/271.out | 69 ++--- 2 files changed, 119 insertions(+), 20 deletions(-) diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271 index 04c57813c2..8b80682cff 100755 --- a/tests/qemu-iotests/271 +++ b/tests/qemu-iotests/271 @@ -81,6 +81,12 @@ _verify_l2_bitmap() fi } +# Filter out trace params which we don't control (file fd) +_filter_trace_fallocate() +{ +sed -E 's/fd=[0-9]+/fd=N/g' +} + # This should be called as _run_test c=XXX sc=XXX off=XXX len=XXX cmd=XXX # c: cluster number (0 if unset) # sc: subcluster number inside cluster @c (0 if unset) @@ -94,12 +100,13 @@ _verify_l2_bitmap() # discard -> discard _run_test() { -unset c sc off len cmd +unset c sc off len cmd opt for var in "$@"; do eval "$var"; done case "${cmd:-write}" in zero) cmd="write -q -z";; unmap) +opt="--trace enable=file_do_fallocate" cmd="write -q -z -u";; compress) pat=$((${pat:-0} + 1)) @@ -108,6 +115,7 @@ _run_test() pat=$((${pat:-0} + 1)) cmd="write -q -P ${pat}";; discard) +opt="--trace enable=file_do_fallocate" cmd="discard -q";; *) echo "Unknown option $cmd" @@ -121,7 +129,7 @@ _run_test() cmd="$cmd ${offset} ${len}" raw_cmd=$(echo $cmd | sed s/-c//) # Raw images don't support -c echo $cmd | sed 's/-P [0-9][0-9]\?/-P PATTERN/' -$QEMU_IO -c "$cmd" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO $opt -c "$cmd" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_trace_fallocate $QEMU_IO -c "$raw_cmd" -f raw "$TEST_IMG.raw" | _filter_qemu_io _verify_img _verify_l2_bitmap "$c" @@ -202,9 +210,10 @@ for use_backing_file in yes no; do alloc="$(seq 16 31)"; zero="$(seq 0 15)" _run_test sc=0 len=32k cmd=unmap -### Zero and unmap cluster #0 +### Zero and unmap second half of cluster #0 (this will unmap it and +### discard l2 entry) alloc=""; zero="$(seq 0 31)" -_run_test sc=0 len=64k cmd=unmap +_run_test sc=16 len=32k cmd=unmap ### Write subcluster #1 (middle of subcluster) alloc="1"; zero="0 $(seq 2 31)" @@ -439,6 +448,12 @@ for use_backing_file in yes no; do _verify_l2_bitmap 16 _verify_l2_bitmap 17 +# Unmap subclusters #0-#3 of an unallocated cluster #8. Since +# 'write -z -u' doesn't lead to full discard, subclusters should become +# explicitly zeroized +alloc=""; zero="$(seq 0 3)" +_run_test c=8 sc=0 len=8k cmd=unmap + # Cluster-aligned request from clusters #9 to #11 alloc=""; zero="$(seq 0 31)" _run_test c=9 sc=0 len=192k cmd=unmap @@ -523,26 +538,45 @@ for use_backing_file in yes no; do echo echo "### Discarding clusters with non-zero bitmaps (backing file: $use_backing_file) ###" echo + +_reset_img 1M + +# Write first half of cluster #0 and discard another half +alloc="$(seq 0 15)"; zero="" +_run_test sc=0 len=32k +# When discarding unallocated subclusters, they only have to be +# explicitly zeroized when the image has a backing file if [ "$use_backing_file" = "yes" ]; then -_make_test_img -o extended_l2=on -F raw -b "$TEST_IMG.base" 1M +alloc="$(seq 0 15)"; zero="$(seq 16 31)" else -_make_test_img -o extended_l2=on 1M +alloc="$(seq 0 15)"; zero="" fi -# Write clusters #0-#2 and then discard them -$QEMU_IO -c 'write -q 0 128k' "$TEST_IMG" -$QEMU_IO -c 'discard -q 0 128k' "$TEST_IMG" +_run_test sc=16 len=32k cmd=discard + +# Write cluster #0 and discard its subclusters #0-#3 +alloc="$(seq 0 31)"; zero="" +_run_test sc=0 len=64k +alloc="$(seq 4 31)"; zero="$(seq 0 3)" +_run_test sc=0 len=8k cmd=discard + +# Discard remaining subclusters #4-#32. This should unmap the cluster +# and discard its l2 entry +alloc=""; zero="$(seq 0 31)" +_r
[PATCH v2 07/11] qcow2: add get_sc_range_info() helper for working with subcluster ranges
This helper simply obtains the l2 table parameters of the cluster which contains the given subclusters range. Right now this info is being obtained and used by zero_l2_subclusters(). As we're about to introduce the subclusters discard operation, this helper would let us avoid code duplication. Also introduce struct SubClusterRangeInfo, which would contain all the needed params. Signed-off-by: Andrey Drobyshev --- block/qcow2-cluster.c | 140 -- 1 file changed, 108 insertions(+), 32 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 7dff0bd5a1..475f167035 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1915,6 +1915,103 @@ discard_no_unref_any_file(BlockDriverState *bs, uint64_t offset, } } +/* + * Structure containing info about the subclusters range within one cluster. + * + * Since @l2_slice is a strong reference to the l2 table slice containing + * the corresponding l2 entry, it must be explicitly released by + * qcow2_cache_put(). Thus the user must either declare it with g_auto() + * (in which case sc_range_info_cleanup() is called automatically) or do + * the cleanup themselves. + */ +typedef struct SubClusterRangeInfo { +uint64_t *l2_slice; +int l2_index; +uint64_t l2_entry; +uint64_t l2_bitmap; +QCow2ClusterType ctype; +Qcow2Cache *l2_table_cache; +} SubClusterRangeInfo; + +static void sc_range_info_cleanup(SubClusterRangeInfo *scri) +{ +if (scri->l2_table_cache && scri->l2_slice) { +qcow2_cache_put(scri->l2_table_cache, (void **) >l2_slice); +} +} + +G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(SubClusterRangeInfo, sc_range_info_cleanup); + +/* + * For a given @offset and @nb_subclusters, fill out the SubClusterRangeInfo + * structure describing the subclusters range and referred to by @scri. + * Only the subclusters which can be independently discarded/zeroized + * (i.e. not compressed or invalid) are considered to be valid here. + * + * The subclusters range is denoted by @offset and @nb_subclusters and must + * not cross the cluster boundary. @offset must be aligned to the subcluster + * size. + * + * Return: 0 if the SubClusterRangeInfo is successfully filled out and the + * subclusters within the given range might be discarded/zeroized; + * -EINVAL if any of the subclusters within the range is invalid; + * -ENOTSUP if the range is contained within a compressed cluster. + */ +static int GRAPH_RDLOCK +get_sc_range_info(BlockDriverState *bs, uint64_t offset, + unsigned nb_subclusters, SubClusterRangeInfo *scri) +{ +BDRVQcow2State *s = bs->opaque; +int ret, sc_cleared, sc_index = offset_to_sc_index(s, offset); +QCow2SubclusterType sctype; + +/* Here we only work with the subclusters within single cluster. */ +assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster); +assert(sc_index + nb_subclusters <= s->subclusters_per_cluster); +assert(offset_into_subcluster(s, offset) == 0); + +scri->l2_table_cache = s->l2_table_cache; + +ret = get_cluster_table(bs, offset, >l2_slice, >l2_index); +if (ret < 0) { +goto cleanup; +} + +scri->l2_entry = get_l2_entry(s, scri->l2_slice, scri->l2_index); +scri->l2_bitmap = get_l2_bitmap(s, scri->l2_slice, scri->l2_index); +scri->ctype = qcow2_get_cluster_type(bs, scri->l2_entry); + +sc_cleared = 0; +do { +ret = qcow2_get_subcluster_range_type( +bs, scri->l2_entry, scri->l2_bitmap, sc_index + sc_cleared, +); +if (ret < 0) { +goto cleanup; +} + +switch (sctype) { +case QCOW2_SUBCLUSTER_COMPRESSED: +/* We cannot partially zeroize/discard compressed clusters. */ +ret = -ENOTSUP; +goto cleanup; +case QCOW2_SUBCLUSTER_INVALID: +ret = -EINVAL; +goto cleanup; +default: +break; +} + +sc_cleared += ret; +} while (sc_cleared < nb_subclusters); + +return 0; + +cleanup: +sc_range_info_cleanup(scri); +return ret; +} + /* * This discards as many clusters of nb_clusters as possible at once (i.e. * all clusters in the same L2 slice) and returns the number of discarded @@ -2127,46 +2224,25 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, unsigned nb_subclusters) { BDRVQcow2State *s = bs->opaque; -uint64_t *l2_slice; -uint64_t old_l2_bitmap, l2_bitmap; -int l2_index, ret, sc = offset_to_sc_index(s, offset); - -/* For full clusters use zero_in_l2_slice() instead */ -assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster); -assert(sc + nb_subclusters <= s->subclusters_per_cluster); -assert(offset_into_subcluster(s, offset) == 0); +uint64_t
[PATCH v2 05/11] iotests/common.rc: add disk_usage function
Move the definition from iotests/250 to common.rc. This is used to detect real disk usage of sparse files. In particular, we want to use it for checking subclusters-based discards. Signed-off-by: Andrey Drobyshev --- tests/qemu-iotests/250 | 5 - tests/qemu-iotests/common.rc | 6 ++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/qemu-iotests/250 b/tests/qemu-iotests/250 index af48f83aba..c0a0dbc0ff 100755 --- a/tests/qemu-iotests/250 +++ b/tests/qemu-iotests/250 @@ -52,11 +52,6 @@ _unsupported_imgopts data_file # bdrv_co_truncate(bs->file) call in qcow2_co_truncate(), which might succeed # anyway. -disk_usage() -{ -du --block-size=1 $1 | awk '{print $1}' -} - size=2100M _make_test_img -o "cluster_size=1M,preallocation=metadata" $size diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 95c12577dd..237f746af8 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -140,6 +140,12 @@ _optstr_add() fi } +# report real disk usage for sparse files +disk_usage() +{ +du --block-size=1 "$1" | awk '{print $1}' +} + # Set the variables to the empty string to turn Valgrind off # for specific processes, e.g. # $ VALGRIND_QEMU_IO= ./check -qcow2 -valgrind 015 -- 2.39.3
[PATCH v2 09/11] qcow2: make subclusters discardable
This commit makes the discard operation work on the subcluster level rather than cluster level. It introduces discard_l2_subclusters() function and makes use of it in qcow2 discard implementation, much like it's done with zero_in_l2_slice() / zero_l2_subclusters(). It also changes the qcow2 driver pdiscard_alignment to subcluster_size. That way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE) operation and free host disk space. This feature will let us gain additional disk space on guest TRIM/discard requests, especially when using large enough clusters (1M, 2M) with subclusters enabled. Also rename qcow2_cluster_discard() -> qcow2_subcluster_discard() to reflect the change. Signed-off-by: Andrey Drobyshev --- block/qcow2-cluster.c | 106 + block/qcow2-snapshot.c | 6 +-- block/qcow2.c | 25 +- block/qcow2.h | 4 +- tests/qemu-iotests/271 | 2 +- 5 files changed, 117 insertions(+), 26 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 8d39e2f960..3c134a7e80 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -2105,25 +2105,106 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters, return nb_clusters; } -int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, - uint64_t bytes, enum qcow2_discard_type type, - bool full_discard) +static int coroutine_fn GRAPH_RDLOCK +discard_l2_subclusters(BlockDriverState *bs, uint64_t offset, + uint64_t nb_subclusters, + enum qcow2_discard_type type, + bool full_discard) +{ +BDRVQcow2State *s = bs->opaque; +uint64_t new_l2_bitmap, bitmap_alloc_mask, bitmap_zero_mask; +int ret, sc = offset_to_sc_index(s, offset); +g_auto(SubClusterRangeInfo) scri = { 0 }; + +ret = get_sc_range_info(bs, offset, nb_subclusters, ); +if (ret < 0) { +return ret; +} + +new_l2_bitmap = scri.l2_bitmap; +bitmap_alloc_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters); +bitmap_zero_mask = QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters); + +new_l2_bitmap &= ~bitmap_alloc_mask; + +/* + * Full discard means we fall through to the backing file, thus we need + * to mark the subclusters as deallocated and clear the corresponding + * zero bits. + * + * Non-full discard means subclusters should be explicitly marked as + * zeroes. In this case QCOW2 specification requires the corresponding + * allocation status bits to be unset as well. If the subclusters are + * deallocated in the first place and there's no backing, the operation + * can be skipped. + */ +if (full_discard) { +new_l2_bitmap &= ~bitmap_zero_mask; +} else if (bs->backing || scri.l2_bitmap & bitmap_alloc_mask) { +new_l2_bitmap |= bitmap_zero_mask; +} + +/* + * If after discarding this range there won't be any allocated subclusters + * left, and new bitmap becomes the same as it'd be after discarding the + * whole cluster, we better discard it entirely. That way we'd also + * update the refcount table. + */ +if ((full_discard && new_l2_bitmap == 0) || +(!full_discard && new_l2_bitmap == QCOW_L2_BITMAP_ALL_ZEROES)) { +return discard_in_l2_slice( +bs, QEMU_ALIGN_DOWN(offset, s->cluster_size), +1, type, full_discard); +} + +if (scri.l2_bitmap != new_l2_bitmap) { +set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap); +qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice); +} + +discard_no_unref_any_file( +bs, (scri.l2_entry & L2E_OFFSET_MASK) + offset_into_cluster(s, offset), +nb_subclusters * s->subcluster_size, scri.ctype, type); + +return 0; +} + +int qcow2_subcluster_discard(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, enum qcow2_discard_type type, + bool full_discard) { BDRVQcow2State *s = bs->opaque; uint64_t end_offset = offset + bytes; uint64_t nb_clusters; +unsigned head, tail; int64_t cleared; int ret; /* Caller must pass aligned values, except at image end */ -assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); -assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || +assert(QEMU_IS_ALIGNED(offset, s->subcluster_size)); +assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) || end_offset == bs->total_sectors << BDRV_SECTOR_BITS); -nb_clusters = size_to_clusters(s, bytes); +head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset; +offset += head; + +tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 : +
[PATCH v2 02/11] qcow2: simplify L2 entries accounting for discard-no-unref
Commits 42a2890a and b2b10904 introduce handling of discard-no-unref option in discard_in_l2_slice() and zero_in_l2_slice(). They add even more if's when chosing the right l2 entry. What we really need for this option is the new entry simply to contain the same host cluster offset, no matter whether we unmap or zeroize the cluster. For that OR'ing with the old entry is enough. This patch doesn't change the logic and is pure refactoring. Signed-off-by: Andrey Drobyshev --- block/qcow2-cluster.c | 34 +++--- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index ce8c0076b3..5f057ba2fd 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1946,25 +1946,21 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters, new_l2_entry = new_l2_bitmap = 0; } else if (bs->backing || qcow2_cluster_is_allocated(cluster_type)) { if (has_subclusters(s)) { -if (keep_reference) { -new_l2_entry = old_l2_entry; -} else { -new_l2_entry = 0; -} +new_l2_entry = 0; new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES; } else { -if (s->qcow_version >= 3) { -if (keep_reference) { -new_l2_entry |= QCOW_OFLAG_ZERO; -} else { -new_l2_entry = QCOW_OFLAG_ZERO; -} -} else { -new_l2_entry = 0; -} +new_l2_entry = s->qcow_version >= 3 ? QCOW_OFLAG_ZERO : 0; } } +/* + * No need to check for the QCOW version since discard-no-unref is + * only allowed since version 3. + */ +if (keep_reference) { +new_l2_entry |= old_l2_entry; +} + if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) { continue; } @@ -2064,19 +2060,19 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset, ((flags & BDRV_REQ_MAY_UNMAP) && qcow2_cluster_is_allocated(type)); bool keep_reference = (s->discard_no_unref && type != QCOW2_CLUSTER_COMPRESSED); -uint64_t new_l2_entry = old_l2_entry; +uint64_t new_l2_entry = unmap ? 0 : old_l2_entry; uint64_t new_l2_bitmap = old_l2_bitmap; -if (unmap && !keep_reference) { -new_l2_entry = 0; -} - if (has_subclusters(s)) { new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES; } else { new_l2_entry |= QCOW_OFLAG_ZERO; } +if (keep_reference) { +new_l2_entry |= old_l2_entry; +} + if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) { continue; } -- 2.39.3
[PATCH v2 04/11] block/file-posix: add trace event for fallocate() calls
This would ease debugging of write zeroes and discard operations. Signed-off-by: Andrey Drobyshev --- block/file-posix.c | 1 + block/trace-events | 1 + 2 files changed, 2 insertions(+) diff --git a/block/file-posix.c b/block/file-posix.c index 35684f7e21..45134f0eef 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1859,6 +1859,7 @@ static int translate_err(int err) static int do_fallocate(int fd, int mode, off_t offset, off_t len) { do { +trace_file_do_fallocate(fd, mode, offset, len); if (fallocate(fd, mode, offset, len) == 0) { return 0; } diff --git a/block/trace-events b/block/trace-events index 8e789e1f12..2f7ad28996 100644 --- a/block/trace-events +++ b/block/trace-events @@ -203,6 +203,7 @@ curl_setup_preadv(uint64_t bytes, uint64_t start, const char *range) "reading %" curl_close(void) "close" # file-posix.c +file_do_fallocate(int fd, int mode, int64_t offset, int64_t len) "fd=%d mode=0x%02x offset=%" PRIi64 " len=%" PRIi64 file_copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t dst_off, int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset %"PRIu64" dst_fd %d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64 file_FindEjectableOpticalMedia(const char *media) "Matching using %s" file_setup_cdrom(const char *partition) "Using %s as optical disc" -- 2.39.3
[PATCH v2 10/11] qcow2: zero_l2_subclusters: fall through to discard operation when requested
When zeroizing subclusters within single cluster, detect usage of the BDRV_REQ_MAY_UNMAP flag and fall through to the subcluster-based discard operation, much like it's done with the cluster-based discards. That way subcluster-aligned operations "qemu-io -c 'write -z -u ...'" will lead to actual unmap. Signed-off-by: Andrey Drobyshev --- block/qcow2-cluster.c | 26 -- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 3c134a7e80..53e04eff93 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -2107,15 +2107,16 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters, static int coroutine_fn GRAPH_RDLOCK discard_l2_subclusters(BlockDriverState *bs, uint64_t offset, - uint64_t nb_subclusters, - enum qcow2_discard_type type, - bool full_discard) + uint64_t nb_subclusters, enum qcow2_discard_type type, + bool full_discard, bool uncond_zeroize) { BDRVQcow2State *s = bs->opaque; uint64_t new_l2_bitmap, bitmap_alloc_mask, bitmap_zero_mask; int ret, sc = offset_to_sc_index(s, offset); g_auto(SubClusterRangeInfo) scri = { 0 }; +assert(!(full_discard && uncond_zeroize)); + ret = get_sc_range_info(bs, offset, nb_subclusters, ); if (ret < 0) { return ret; @@ -2140,7 +2141,8 @@ discard_l2_subclusters(BlockDriverState *bs, uint64_t offset, */ if (full_discard) { new_l2_bitmap &= ~bitmap_zero_mask; -} else if (bs->backing || scri.l2_bitmap & bitmap_alloc_mask) { +} else if (uncond_zeroize || bs->backing || + scri.l2_bitmap & bitmap_alloc_mask) { new_l2_bitmap |= bitmap_zero_mask; } @@ -2197,7 +2199,7 @@ int qcow2_subcluster_discard(BlockDriverState *bs, uint64_t offset, if (head) { ret = discard_l2_subclusters(bs, offset - head, size_to_subclusters(s, head), type, - full_discard); + full_discard, false); if (ret < 0) { goto fail; } @@ -2221,7 +2223,7 @@ int qcow2_subcluster_discard(BlockDriverState *bs, uint64_t offset, if (tail) { ret = discard_l2_subclusters(bs, end_offset, size_to_subclusters(s, tail), type, - full_discard); + full_discard, false); if (ret < 0) { goto fail; } @@ -2318,6 +2320,18 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, int ret, sc = offset_to_sc_index(s, offset); g_auto(SubClusterRangeInfo) scri = { 0 }; +/* + * If the request allows discarding subclusters we go down the discard + * path regardless of their allocation status. After the discard + * operation with full_discard=false subclusters are going to be read as + * zeroes anyway. But we make sure that subclusters are explicitly + * zeroed anyway with uncond_zeroize=true. + */ +if (flags & BDRV_REQ_MAY_UNMAP) { +return discard_l2_subclusters(bs, offset, nb_subclusters, + QCOW2_DISCARD_REQUEST, false, true); +} + ret = get_sc_range_info(bs, offset, nb_subclusters, ); if (ret < 0) { return ret; -- 2.39.3
[PATCH v2 01/11] qcow2: make function update_refcount_discard() global
We are going to need it for discarding separate subclusters. The function itself doesn't do anything with the refcount tables, it simply adds a discard request to the queue, so rename it to qcow2_queue_discard(). Signed-off-by: Andrey Drobyshev Reviewed-by: Hanna Czenczek --- block/qcow2-refcount.c | 8 block/qcow2.h | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 0266542cee..2026f5fa21 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -754,8 +754,8 @@ void qcow2_process_discards(BlockDriverState *bs, int ret) } } -static void update_refcount_discard(BlockDriverState *bs, -uint64_t offset, uint64_t length) +void qcow2_queue_discard(BlockDriverState *bs, uint64_t offset, + uint64_t length) { BDRVQcow2State *s = bs->opaque; Qcow2DiscardRegion *d, *p, *next; @@ -902,7 +902,7 @@ update_refcount(BlockDriverState *bs, int64_t offset, int64_t length, } if (s->discard_passthrough[type]) { -update_refcount_discard(bs, cluster_offset, s->cluster_size); +qcow2_queue_discard(bs, cluster_offset, s->cluster_size); } } } @@ -3619,7 +3619,7 @@ qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs) /* discard refblock from the cache if refblock is cached */ qcow2_cache_discard(s->refcount_block_cache, refblock); } -update_refcount_discard(bs, discard_block_offs, s->cluster_size); +qcow2_queue_discard(bs, discard_block_offs, s->cluster_size); return 0; } diff --git a/block/qcow2.h b/block/qcow2.h index a9e3481c6e..197bdcdf53 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -891,6 +891,8 @@ int coroutine_fn qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *re BdrvCheckMode fix); void GRAPH_RDLOCK qcow2_process_discards(BlockDriverState *bs, int ret); +void qcow2_queue_discard(BlockDriverState *bs, uint64_t offset, + uint64_t length); int GRAPH_RDLOCK qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, -- 2.39.3
[PATCH v2 08/11] qcow2: zeroize the entire cluster when there're no non-zero subclusters
When zeroizing the last non-zero subclusters within single cluster, it makes sense to go zeroize the entire cluster and go down zero_in_l2_slice() path right away. That way we'd also update the corresponding refcount table. Signed-off-by: Andrey Drobyshev Reviewed-by: Hanna Czenczek --- block/qcow2-cluster.c | 17 ++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 475f167035..8d39e2f960 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -2221,7 +2221,7 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset, static int coroutine_fn GRAPH_RDLOCK zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, -unsigned nb_subclusters) +unsigned nb_subclusters, int flags) { BDRVQcow2State *s = bs->opaque; uint64_t new_l2_bitmap; @@ -2237,6 +2237,16 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters); new_l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters); +/* + * If there're no non-zero subclusters left, we might as well zeroize + * the entire cluster. That way we'd also update the refcount table. + */ +if ((new_l2_bitmap & QCOW_L2_BITMAP_ALL_ZEROES) == +QCOW_L2_BITMAP_ALL_ZEROES) { +return zero_in_l2_slice(bs, QEMU_ALIGN_DOWN(offset, s->cluster_size), +1, flags); +} + if (new_l2_bitmap != scri.l2_bitmap) { set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap); qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice); @@ -2293,7 +2303,7 @@ int coroutine_fn qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset, if (head) { ret = zero_l2_subclusters(bs, offset - head, - size_to_subclusters(s, head)); + size_to_subclusters(s, head), flags); if (ret < 0) { goto fail; } @@ -2314,7 +2324,8 @@ int coroutine_fn qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset, } if (tail) { -ret = zero_l2_subclusters(bs, end_offset, size_to_subclusters(s, tail)); +ret = zero_l2_subclusters(bs, end_offset, + size_to_subclusters(s, tail), flags); if (ret < 0) { goto fail; } -- 2.39.3
Re: [BUG, RFC] Base node is in RW after making external snapshot
On 4/24/24 21:00, Andrey Drobyshev wrote: > Hi everyone, > > When making an external snapshot, we end up in a situation when 2 block > graph nodes related to the same image file (format and storage nodes) > have different RO flags set on them. > > E.g. > > # ls -la /proc/PID/fd > lrwx-- 1 root qemu 64 Apr 24 20:14 12 -> /path/to/harddisk.hdd > > # virsh qemu-monitor-command VM '{"execute": "query-named-block-nodes"}' > --pretty | egrep '"node-name"|"ro"' > "ro": false, > "node-name": "libvirt-1-format", > "ro": false, > "node-name": "libvirt-1-storage", > > # virsh snapshot-create-as VM --name snap --disk-only > Domain snapshot snap created > > # ls -la /proc/PID/fd > lr-x-- 1 root qemu 64 Apr 24 20:14 134 -> /path/to/harddisk.hdd > lrwx-- 1 root qemu 64 Apr 24 20:14 135 -> /path/to/harddisk.snap > > # virsh qemu-monitor-command VM '{"execute": "query-named-block-nodes"}' > --pretty | egrep '"node-name"|"ro"' > "ro": false, > "node-name": "libvirt-2-format", > "ro": false, > "node-name": "libvirt-2-storage", > "ro": true, > "node-name": "libvirt-1-format", > "ro": false,<-- > "node-name": "libvirt-1-storage", > > File descriptor has been reopened in RO, but "libvirt-1-storage" node > still has RW permissions set. > > I'm wondering it this a bug or this is intended? Looks like a bug to > me, although I see that some iotests (e.g. 273) expect 2 nodes related > to the same image file to have different RO flags. > > bdrv_reopen_set_read_only() > bdrv_reopen() > bdrv_reopen_queue() > bdrv_reopen_queue_child() > bdrv_reopen_multiple() > bdrv_list_refresh_perms() > bdrv_topological_dfs() > bdrv_do_refresh_perms() > bdrv_reopen_commit() > > In the stack above bdrv_reopen_set_read_only() is only being called for > the parent (libvirt-1-format) node. There're 2 lists: BDSs from > refresh_list are used by bdrv_drv_set_perm and this leads to actual > reopen with RO of the file descriptor. And then there's reopen queue > bs_queue -- BDSs from this queue get their parameters updated. While > refresh_list ends up having the whole subtree (including children, this > is done in bdrv_topological_dfs()) bs_queue only has the parent. And > that is because storage (child) node's (bs->inherits_from == NULL), so > bdrv_reopen_queue_child() never adds it to the queue. Could it be the > source of this bug? > > Anyway, would greatly appreciate a clarification. > > Andrey Friendly ping. Could somebody confirm that it is a bug indeed?
[BUG, RFC] Base node is in RW after making external snapshot
Hi everyone, When making an external snapshot, we end up in a situation when 2 block graph nodes related to the same image file (format and storage nodes) have different RO flags set on them. E.g. # ls -la /proc/PID/fd lrwx-- 1 root qemu 64 Apr 24 20:14 12 -> /path/to/harddisk.hdd # virsh qemu-monitor-command VM '{"execute": "query-named-block-nodes"}' --pretty | egrep '"node-name"|"ro"' "ro": false, "node-name": "libvirt-1-format", "ro": false, "node-name": "libvirt-1-storage", # virsh snapshot-create-as VM --name snap --disk-only Domain snapshot snap created # ls -la /proc/PID/fd lr-x-- 1 root qemu 64 Apr 24 20:14 134 -> /path/to/harddisk.hdd lrwx-- 1 root qemu 64 Apr 24 20:14 135 -> /path/to/harddisk.snap # virsh qemu-monitor-command VM '{"execute": "query-named-block-nodes"}' --pretty | egrep '"node-name"|"ro"' "ro": false, "node-name": "libvirt-2-format", "ro": false, "node-name": "libvirt-2-storage", "ro": true, "node-name": "libvirt-1-format", "ro": false,<-- "node-name": "libvirt-1-storage", File descriptor has been reopened in RO, but "libvirt-1-storage" node still has RW permissions set. I'm wondering it this a bug or this is intended? Looks like a bug to me, although I see that some iotests (e.g. 273) expect 2 nodes related to the same image file to have different RO flags. bdrv_reopen_set_read_only() bdrv_reopen() bdrv_reopen_queue() bdrv_reopen_queue_child() bdrv_reopen_multiple() bdrv_list_refresh_perms() bdrv_topological_dfs() bdrv_do_refresh_perms() bdrv_reopen_commit() In the stack above bdrv_reopen_set_read_only() is only being called for the parent (libvirt-1-format) node. There're 2 lists: BDSs from refresh_list are used by bdrv_drv_set_perm and this leads to actual reopen with RO of the file descriptor. And then there's reopen queue bs_queue -- BDSs from this queue get their parameters updated. While refresh_list ends up having the whole subtree (including children, this is done in bdrv_topological_dfs()) bs_queue only has the parent. And that is because storage (child) node's (bs->inherits_from == NULL), so bdrv_reopen_queue_child() never adds it to the queue. Could it be the source of this bug? Anyway, would greatly appreciate a clarification. Andrey
Re: [PATCH 4/7] qcow2: make subclusters discardable
On 10/27/23 14:10, Jean-Louis Dupond wrote: > [...] > > I've checked all the code paths, and as far as I see it nowhere breaks > the discard_no_unref option. > It's important that we don't introduce new code paths that can make > holes in the qcow2 image when this option is enabled :) > > If you can confirm my conclusion, that would be great. > > > Thanks > Jean-Louis > Hi Jean-Louis, I've finally got to working on v2 for this series. However I'm failing to get a grasp on what this option is supposed to be doing and what are we trying to avoid here. Consider this simple example: # cd build # ./qemu-img create -f qcow2 unref.qcow2 192K # ./qemu-img create -f qcow2 nounref.qcow2 192K # ./qemu-io -c "write 0 192K" unref.qcow2 # ./qemu-io -c "write 0 192K" nounref.qcow2 # # strace -fv -e fallocate ./qemu-io -c "discard 64K 64K" unref.qcow2 [pid 887710] fallocate(9, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 393216, 65536) = 0 discard 65536/65536 bytes at offset 65536 64 KiB, 1 ops; 00.00 sec (252.123 MiB/sec and 4033.9660 ops/sec) # # strace -fv -e fallocate ./qemu-io -c "reopen -o discard-no-unref=on" -c "discard 64K 64K" nounref.qcow2 # [pid 887789] fallocate(9, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 393216, 65536) = 0 discard 65536/65536 bytes at offset 65536 64 KiB, 1 ops; 00.00 sec (345.457 MiB/sec and 5527.3049 ops/sec) # # ./qemu-img check unref.qcow2 No errors were found on the image. 2/3 = 66.67% allocated, 50.00% fragmented, 0.00% compressed clusters Image end offset: 524288 # ./qemu-img check nounref.qcow2 No errors were found on the image. 3/3 = 100.00% allocated, 0.00% fragmented, 0.00% compressed clusters Image end offset: 524288 # # ls -la *.qcow2 -rw-r--r-- 1 root root 524288 Apr 16 22:42 nounref.qcow2 -rw-r--r-- 1 root root 524288 Apr 16 22:41 unref.qcow2 # du --block-size=1 *.qcow2 397312 nounref.qcow2 397312 unref.qcow2 I understand that by keeping the L2 entry we achieve that cluster remains formally allocated, but no matter whether "discard-no-unref" option is enabled fallocate(FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE) is being called leaving a hole in the file (e.g. file becomes sparse). However you say in the comment above that we can't allow making new holes in the file when this option is enabled. How does that correlate and what do we achieve? And which logic do you think we need to follow when discarding separate subclusters? Andrey
Re: [PATCH v4 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-privileged' field
On 3/22/24 15:17, Andrey Drobyshev wrote: > On 3/22/24 12:39, Daniel P. Berrangé wrote: >> On Wed, Mar 20, 2024 at 06:16:42PM +0200, Andrey Drobyshev wrote: >>> Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to >>> GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo: >>> used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as >>> returned by statvfs(3). While on Windows guests that's all we can get >>> with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in >>> total file system size, as it's visible for root user. Let's add an >>> optional field 'total-bytes-privileged' to GuestFilesystemInfo struct, >>> which'd only be reported on POSIX and represent f_blocks value as returned >>> by statvfs(3). >>> >>> While here, also tweak the docs to reflect better where those values >>> come from. >>> >>> Signed-off-by: Andrey Drobyshev >>> --- >>> qga/commands-posix.c | 2 ++ >>> qga/commands-win32.c | 1 + >>> qga/qapi-schema.json | 7 +-- >>> 3 files changed, 8 insertions(+), 2 deletions(-) >>> >>> diff --git a/qga/commands-posix.c b/qga/commands-posix.c >>> index 26008db497..7df2d72e9f 100644 >>> --- a/qga/commands-posix.c >>> +++ b/qga/commands-posix.c >>> @@ -1569,8 +1569,10 @@ static GuestFilesystemInfo >>> *build_guest_fsinfo(struct FsMount *mount, >>> nonroot_total = used + buf.f_bavail; >>> fs->used_bytes = used * fr_size; >>> fs->total_bytes = nonroot_total * fr_size; >>> +fs->total_bytes_privileged = buf.f_blocks * fr_size; >>> >>> fs->has_total_bytes = true; >>> +fs->has_total_bytes_privileged = true; >>> fs->has_used_bytes = true; >>> } >>> >>> diff --git a/qga/commands-win32.c b/qga/commands-win32.c >>> index 6242737b00..6fee0e1e6f 100644 >>> --- a/qga/commands-win32.c >>> +++ b/qga/commands-win32.c >>> @@ -1143,6 +1143,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(char >>> *guid, Error **errp) >>> fs = g_malloc(sizeof(*fs)); >>> fs->name = g_strdup(guid); >>> fs->has_total_bytes = false; >>> +fs->has_total_bytes_privileged = false; >>> fs->has_used_bytes = false; >>> if (len == 0) { >>> fs->mountpoint = g_strdup("System Reserved"); >>> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json >>> index 9554b566a7..dcc469b268 100644 >>> --- a/qga/qapi-schema.json >>> +++ b/qga/qapi-schema.json >>> @@ -1026,7 +1026,10 @@ >>> # >>> # @used-bytes: file system used bytes (since 3.0) >>> # >>> -# @total-bytes: non-root file system total bytes (since 3.0) >>> +# @total-bytes: filesystem capacity in bytes for unprivileged users (since >>> 3.0) >>> +# >>> +# @total-bytes-privileged: filesystem capacity in bytes for privileged >>> users >>> +# (since 9.0) >> >> Will need to be '9.1', not '9.0', since we don't accept new features >> during freeze periods. >> >>> # >>> # @disk: an array of disk hardware information that the volume lies >>> # on, which may be empty if the disk type is not supported >>> @@ -1036,7 +1039,7 @@ >>> { 'struct': 'GuestFilesystemInfo', >>>'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str', >>> '*used-bytes': 'uint64', '*total-bytes': 'uint64', >>> - 'disk': ['GuestDiskAddress']} } >>> + '*total-bytes-privileged': 'uint64', 'disk': >>> ['GuestDiskAddress']} } >>> >>> ## >>> # @guest-get-fsinfo: >> >> Assuming the version is changed: >> >> Reviewed-by: Daniel P. Berrangé >> >> With regards, >> Daniel > > Thanks, Daniel. > > @kkostiuk do I need to send one more iteration with the version change? > Whatever is most convenient for you. > > Andrey Ping
Re: [PATCH v4 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-privileged' field
On 3/22/24 12:39, Daniel P. Berrangé wrote: > On Wed, Mar 20, 2024 at 06:16:42PM +0200, Andrey Drobyshev wrote: >> Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to >> GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo: >> used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as >> returned by statvfs(3). While on Windows guests that's all we can get >> with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in >> total file system size, as it's visible for root user. Let's add an >> optional field 'total-bytes-privileged' to GuestFilesystemInfo struct, >> which'd only be reported on POSIX and represent f_blocks value as returned >> by statvfs(3). >> >> While here, also tweak the docs to reflect better where those values >> come from. >> >> Signed-off-by: Andrey Drobyshev >> --- >> qga/commands-posix.c | 2 ++ >> qga/commands-win32.c | 1 + >> qga/qapi-schema.json | 7 +-- >> 3 files changed, 8 insertions(+), 2 deletions(-) >> >> diff --git a/qga/commands-posix.c b/qga/commands-posix.c >> index 26008db497..7df2d72e9f 100644 >> --- a/qga/commands-posix.c >> +++ b/qga/commands-posix.c >> @@ -1569,8 +1569,10 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct >> FsMount *mount, >> nonroot_total = used + buf.f_bavail; >> fs->used_bytes = used * fr_size; >> fs->total_bytes = nonroot_total * fr_size; >> +fs->total_bytes_privileged = buf.f_blocks * fr_size; >> >> fs->has_total_bytes = true; >> +fs->has_total_bytes_privileged = true; >> fs->has_used_bytes = true; >> } >> >> diff --git a/qga/commands-win32.c b/qga/commands-win32.c >> index 6242737b00..6fee0e1e6f 100644 >> --- a/qga/commands-win32.c >> +++ b/qga/commands-win32.c >> @@ -1143,6 +1143,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(char >> *guid, Error **errp) >> fs = g_malloc(sizeof(*fs)); >> fs->name = g_strdup(guid); >> fs->has_total_bytes = false; >> +fs->has_total_bytes_privileged = false; >> fs->has_used_bytes = false; >> if (len == 0) { >> fs->mountpoint = g_strdup("System Reserved"); >> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json >> index 9554b566a7..dcc469b268 100644 >> --- a/qga/qapi-schema.json >> +++ b/qga/qapi-schema.json >> @@ -1026,7 +1026,10 @@ >> # >> # @used-bytes: file system used bytes (since 3.0) >> # >> -# @total-bytes: non-root file system total bytes (since 3.0) >> +# @total-bytes: filesystem capacity in bytes for unprivileged users (since >> 3.0) >> +# >> +# @total-bytes-privileged: filesystem capacity in bytes for privileged users >> +# (since 9.0) > > Will need to be '9.1', not '9.0', since we don't accept new features > during freeze periods. > >> # >> # @disk: an array of disk hardware information that the volume lies >> # on, which may be empty if the disk type is not supported >> @@ -1036,7 +1039,7 @@ >> { 'struct': 'GuestFilesystemInfo', >>'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str', >> '*used-bytes': 'uint64', '*total-bytes': 'uint64', >> - 'disk': ['GuestDiskAddress']} } >> + '*total-bytes-privileged': 'uint64', 'disk': >> ['GuestDiskAddress']} } >> >> ## >> # @guest-get-fsinfo: > > Assuming the version is changed: > > Reviewed-by: Daniel P. Berrangé > > With regards, > Daniel Thanks, Daniel. @kkostiuk do I need to send one more iteration with the version change? Whatever is most convenient for you. Andrey
[PATCH v4 3/7] qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper
Also remove the G_GNUC_UNUSED attribute added in the previous commit from the helper. Signed-off-by: Andrey Drobyshev Reviewed-by: Daniel P. Berrangé --- qga/commands-posix.c | 39 ++- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 9b1bdf194c..cb9eed9a0b 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -108,7 +108,6 @@ static ssize_t ga_pipe_read_str(int fd[2], char **str) * sending string to stdin and taking error message from * stdout/err. */ -G_GNUC_UNUSED static int ga_run_command(const char *argv[], const char *in_str, const char *action, Error **errp) { @@ -230,8 +229,6 @@ void qmp_guest_shutdown(const char *mode, Error **errp) { const char *shutdown_flag; Error *local_err = NULL; -pid_t pid; -int status; #ifdef CONFIG_SOLARIS const char *powerdown_flag = "-i5"; @@ -260,46 +257,22 @@ void qmp_guest_shutdown(const char *mode, Error **errp) return; } -pid = fork(); -if (pid == 0) { -/* child, start the shutdown */ -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - +const char *argv[] = {"/sbin/shutdown", #ifdef CONFIG_SOLARIS -execl("/sbin/shutdown", "shutdown", shutdown_flag, "-g0", "-y", - "hypervisor initiated shutdown", (char *)NULL); + shutdown_flag, "-g0", "-y", #elif defined(CONFIG_BSD) -execl("/sbin/shutdown", "shutdown", shutdown_flag, "+0", - "hypervisor initiated shutdown", (char *)NULL); + shutdown_flag, "+0", #else -execl("/sbin/shutdown", "shutdown", "-h", shutdown_flag, "+0", - "hypervisor initiated shutdown", (char *)NULL); + "-h", shutdown_flag, "+0", #endif -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} + "hypervisor initiated shutdown", (char *) NULL}; -ga_wait_child(pid, , _err); +ga_run_command(argv, NULL, "shutdown", _err); if (local_err) { error_propagate(errp, local_err); return; } -if (!WIFEXITED(status)) { -error_setg(errp, "child process has terminated abnormally"); -return; -} - -if (WEXITSTATUS(status)) { -error_setg(errp, "child process has failed to shutdown"); -return; -} - /* succeeded */ } -- 2.39.3
[PATCH v4 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-privileged' field
Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo: used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as returned by statvfs(3). While on Windows guests that's all we can get with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in total file system size, as it's visible for root user. Let's add an optional field 'total-bytes-privileged' to GuestFilesystemInfo struct, which'd only be reported on POSIX and represent f_blocks value as returned by statvfs(3). While here, also tweak the docs to reflect better where those values come from. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 2 ++ qga/commands-win32.c | 1 + qga/qapi-schema.json | 7 +-- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 26008db497..7df2d72e9f 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -1569,8 +1569,10 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct FsMount *mount, nonroot_total = used + buf.f_bavail; fs->used_bytes = used * fr_size; fs->total_bytes = nonroot_total * fr_size; +fs->total_bytes_privileged = buf.f_blocks * fr_size; fs->has_total_bytes = true; +fs->has_total_bytes_privileged = true; fs->has_used_bytes = true; } diff --git a/qga/commands-win32.c b/qga/commands-win32.c index 6242737b00..6fee0e1e6f 100644 --- a/qga/commands-win32.c +++ b/qga/commands-win32.c @@ -1143,6 +1143,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(char *guid, Error **errp) fs = g_malloc(sizeof(*fs)); fs->name = g_strdup(guid); fs->has_total_bytes = false; +fs->has_total_bytes_privileged = false; fs->has_used_bytes = false; if (len == 0) { fs->mountpoint = g_strdup("System Reserved"); diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json index 9554b566a7..dcc469b268 100644 --- a/qga/qapi-schema.json +++ b/qga/qapi-schema.json @@ -1026,7 +1026,10 @@ # # @used-bytes: file system used bytes (since 3.0) # -# @total-bytes: non-root file system total bytes (since 3.0) +# @total-bytes: filesystem capacity in bytes for unprivileged users (since 3.0) +# +# @total-bytes-privileged: filesystem capacity in bytes for privileged users +# (since 9.0) # # @disk: an array of disk hardware information that the volume lies # on, which may be empty if the disk type is not supported @@ -1036,7 +1039,7 @@ { 'struct': 'GuestFilesystemInfo', 'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str', '*used-bytes': 'uint64', '*total-bytes': 'uint64', - 'disk': ['GuestDiskAddress']} } + '*total-bytes-privileged': 'uint64', 'disk': ['GuestDiskAddress']} } ## # @guest-get-fsinfo: -- 2.39.3
[PATCH v4 5/7] qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper
There's no need to check for the existence of the hook executable, as the exec() call will do that for us. Signed-off-by: Andrey Drobyshev Reviewed-by: Daniel P. Berrangé --- qga/commands-posix.c | 35 +++ 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 545f3c99dc..9b993772f5 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -736,8 +736,6 @@ static const char *fsfreeze_hook_arg_string[] = { static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp) { -int status; -pid_t pid; const char *hook; const char *arg_str = fsfreeze_hook_arg_string[arg]; Error *local_err = NULL; @@ -746,42 +744,15 @@ static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp) if (!hook) { return; } -if (access(hook, X_OK) != 0) { -error_setg_errno(errp, errno, "can't access fsfreeze hook '%s'", hook); -return; -} -slog("executing fsfreeze hook with arg '%s'", arg_str); -pid = fork(); -if (pid == 0) { -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -execl(hook, hook, arg_str, NULL); -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} +const char *argv[] = {hook, arg_str, NULL}; -ga_wait_child(pid, , _err); +slog("executing fsfreeze hook with arg '%s'", arg_str); +ga_run_command(argv, NULL, "execute fsfreeze hook", _err); if (local_err) { error_propagate(errp, local_err); return; } - -if (!WIFEXITED(status)) { -error_setg(errp, "fsfreeze hook has terminated abnormally"); -return; -} - -status = WEXITSTATUS(status); -if (status) { -error_setg(errp, "fsfreeze hook has failed with status %d", status); -return; -} } /* -- 2.39.3
[PATCH v4 0/7] qga/commands-posix: replace code duplicating commands with a helper
v3 -> v4: * Patch 1/7: - Replaced "since 8.3" with "since 9.0" as we're now at v9.0.0-rc0; - Renamed the field to 'total-bytes-privileged'; - Got rid of the implementation details in the docs; * Patch 6/7: added g_autoptr macro to local error declaration. v3: https://lists.nongnu.org/archive/html/qemu-devel/2024-03/msg04068.html Andrey Drobyshev (7): qga: guest-get-fsinfo: add optional 'total-bytes-privileged' field qga: introduce ga_run_command() helper for guest cmd execution qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper qga/commands-posix: qmp_guest_set_time: use ga_run_command helper qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper qga/commands-posix: don't do fork()/exec() when suspending via sysfs qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper qga/commands-posix.c | 404 +++ qga/commands-win32.c | 1 + qga/qapi-schema.json | 7 +- 3 files changed, 187 insertions(+), 225 deletions(-) -- 2.39.3
[PATCH v4 6/7] qga/commands-posix: don't do fork()/exec() when suspending via sysfs
Since commit 246d76eba ("qga: guest_suspend: decoupling pm-utils and sys logic") pm-utils logic is running in a separate child from the sysfs logic. Now when suspending via sysfs we don't really need to do that in a separate process as we only need to perform one write to /sys/power/state. Let's just use g_file_set_contents() to simplify things here. Suggested-by: Daniel P. Berrangé Signed-off-by: Andrey Drobyshev Reviewed-by: Daniel P. Berrangé --- qga/commands-posix.c | 41 + 1 file changed, 5 insertions(+), 36 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 9b993772f5..9910957ff5 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -1928,52 +1928,21 @@ static bool linux_sys_state_supports_mode(SuspendMode mode, Error **errp) static void linux_sys_state_suspend(SuspendMode mode, Error **errp) { -Error *local_err = NULL; +g_autoptr(GError) local_gerr = NULL; const char *sysfile_strs[3] = {"disk", "mem", NULL}; const char *sysfile_str = sysfile_strs[mode]; -pid_t pid; -int status; if (!sysfile_str) { error_setg(errp, "unknown guest suspend mode"); return; } -pid = fork(); -if (!pid) { -/* child */ -int fd; - -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -fd = open(LINUX_SYS_STATE_FILE, O_WRONLY); -if (fd < 0) { -_exit(EXIT_FAILURE); -} - -if (write(fd, sysfile_str, strlen(sysfile_str)) < 0) { -_exit(EXIT_FAILURE); -} - -_exit(EXIT_SUCCESS); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} - -ga_wait_child(pid, , _err); -if (local_err) { -error_propagate(errp, local_err); +if (!g_file_set_contents(LINUX_SYS_STATE_FILE, sysfile_str, + -1, _gerr)) { +error_setg(errp, "suspend: cannot write to '%s': %s", + LINUX_SYS_STATE_FILE, local_gerr->message); return; } - -if (WEXITSTATUS(status)) { -error_setg(errp, "child process has failed to suspend"); -} - } static void guest_suspend(SuspendMode mode, Error **errp) -- 2.39.3
[PATCH v4 4/7] qga/commands-posix: qmp_guest_set_time: use ga_run_command helper
There's no need to check for the existence of "/sbin/hwclock", the exec() call will do that for us. Signed-off-by: Andrey Drobyshev Reviewed-by: Daniel P. Berrangé --- qga/commands-posix.c | 43 +++ 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index cb9eed9a0b..545f3c99dc 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -279,21 +279,9 @@ void qmp_guest_shutdown(const char *mode, Error **errp) void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp) { int ret; -int status; -pid_t pid; Error *local_err = NULL; struct timeval tv; -static const char hwclock_path[] = "/sbin/hwclock"; -static int hwclock_available = -1; - -if (hwclock_available < 0) { -hwclock_available = (access(hwclock_path, X_OK) == 0); -} - -if (!hwclock_available) { -error_setg(errp, QERR_UNSUPPORTED); -return; -} +const char *argv[] = {"/sbin/hwclock", has_time ? "-w" : "-s", NULL}; /* If user has passed a time, validate and set it. */ if (has_time) { @@ -324,37 +312,12 @@ void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp) * just need to synchronize the hardware clock. However, if no time was * passed, user is requesting the opposite: set the system time from the * hardware clock (RTC). */ -pid = fork(); -if (pid == 0) { -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -/* Use '/sbin/hwclock -w' to set RTC from the system time, - * or '/sbin/hwclock -s' to set the system time from RTC. */ -execl(hwclock_path, "hwclock", has_time ? "-w" : "-s", NULL); -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} - -ga_wait_child(pid, , _err); +ga_run_command(argv, NULL, "set hardware clock to system time", + _err); if (local_err) { error_propagate(errp, local_err); return; } - -if (!WIFEXITED(status)) { -error_setg(errp, "child process has terminated abnormally"); -return; -} - -if (WEXITSTATUS(status)) { -error_setg(errp, "hwclock failed to set hardware clock to system time"); -return; -} } typedef enum { -- 2.39.3
[PATCH v4 7/7] qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper
There's no need to check for the existence of the "chpasswd", "pw" executables, as the exec() call will do that for us. Signed-off-by: Andrey Drobyshev Reviewed-by: Daniel P. Berrangé --- qga/commands-posix.c | 96 ++-- 1 file changed, 13 insertions(+), 83 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 9910957ff5..7a065c4085 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -2151,14 +2151,8 @@ void qmp_guest_set_user_password(const char *username, Error **errp) { Error *local_err = NULL; -char *passwd_path = NULL; -pid_t pid; -int status; -int datafd[2] = { -1, -1 }; -char *rawpasswddata = NULL; +g_autofree char *rawpasswddata = NULL; size_t rawpasswdlen; -char *chpasswddata = NULL; -size_t chpasswdlen; rawpasswddata = (char *)qbase64_decode(password, -1, , errp); if (!rawpasswddata) { @@ -2169,95 +2163,31 @@ void qmp_guest_set_user_password(const char *username, if (strchr(rawpasswddata, '\n')) { error_setg(errp, "forbidden characters in raw password"); -goto out; +return; } if (strchr(username, '\n') || strchr(username, ':')) { error_setg(errp, "forbidden characters in username"); -goto out; +return; } #ifdef __FreeBSD__ -chpasswddata = g_strdup(rawpasswddata); -passwd_path = g_find_program_in_path("pw"); +g_autofree char *chpasswdata = g_strdup(rawpasswddata); +const char *crypt_flag = crypted ? "-H" : "-h"; +const char *argv[] = {"pw", "usermod", "-n", username, + crypt_flag, "0", NULL}; #else -chpasswddata = g_strdup_printf("%s:%s\n", username, rawpasswddata); -passwd_path = g_find_program_in_path("chpasswd"); +g_autofree char *chpasswddata = g_strdup_printf("%s:%s\n", username, +rawpasswddata); +const char *crypt_flag = crypted ? "-e" : NULL; +const char *argv[] = {"chpasswd", crypt_flag, NULL}; #endif -chpasswdlen = strlen(chpasswddata); - -if (!passwd_path) { -error_setg(errp, "cannot find 'passwd' program in PATH"); -goto out; -} - -if (!g_unix_open_pipe(datafd, FD_CLOEXEC, NULL)) { -error_setg(errp, "cannot create pipe FDs"); -goto out; -} - -pid = fork(); -if (pid == 0) { -close(datafd[1]); -/* child */ -setsid(); -dup2(datafd[0], 0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -#ifdef __FreeBSD__ -const char *h_arg; -h_arg = (crypted) ? "-H" : "-h"; -execl(passwd_path, "pw", "usermod", "-n", username, h_arg, "0", NULL); -#else -if (crypted) { -execl(passwd_path, "chpasswd", "-e", NULL); -} else { -execl(passwd_path, "chpasswd", NULL); -} -#endif -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -goto out; -} -close(datafd[0]); -datafd[0] = -1; - -if (qemu_write_full(datafd[1], chpasswddata, chpasswdlen) != chpasswdlen) { -error_setg_errno(errp, errno, "cannot write new account password"); -goto out; -} -close(datafd[1]); -datafd[1] = -1; - -ga_wait_child(pid, , _err); +ga_run_command(argv, chpasswddata, "set user password", _err); if (local_err) { error_propagate(errp, local_err); -goto out; -} - -if (!WIFEXITED(status)) { -error_setg(errp, "child process has terminated abnormally"); -goto out; -} - -if (WEXITSTATUS(status)) { -error_setg(errp, "child process has failed to set user password"); -goto out; -} - -out: -g_free(chpasswddata); -g_free(rawpasswddata); -g_free(passwd_path); -if (datafd[0] != -1) { -close(datafd[0]); -} -if (datafd[1] != -1) { -close(datafd[1]); +return; } } #else /* __linux__ || __FreeBSD__ */ -- 2.39.3
[PATCH v4 2/7] qga: introduce ga_run_command() helper for guest cmd execution
When executing guest commands in *nix environment, we repeat the same fork/exec pattern multiple times. Let's just separate it into a single helper which would also be able to feed input data into the launched process' stdin. This way we can avoid code duplication. To keep the history more bisectable, let's replace qmp commands implementations one by one. Also add G_GNUC_UNUSED attribute to the helper and remove it in the next commit. Originally-by: Yuri Pudgorodskiy Signed-off-by: Andrey Drobyshev Reviewed-by: Daniel P. Berrangé --- qga/commands-posix.c | 150 +++ 1 file changed, 150 insertions(+) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 7df2d72e9f..9b1bdf194c 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -76,6 +76,156 @@ static void ga_wait_child(pid_t pid, int *status, Error **errp) g_assert(rpid == pid); } +static ssize_t ga_pipe_read_str(int fd[2], char **str) +{ +ssize_t n, len = 0; +char buf[1024]; + +close(fd[1]); +fd[1] = -1; +while ((n = read(fd[0], buf, sizeof(buf))) != 0) { +if (n < 0) { +if (errno == EINTR) { +continue; +} else { +len = -errno; +break; +} +} +*str = g_realloc(*str, len + n + 1); +memcpy(*str + len, buf, n); +len += n; +*str[len] = '\0'; +} +close(fd[0]); +fd[0] = -1; + +return len; +} + +/* + * Helper to run command with input/output redirection, + * sending string to stdin and taking error message from + * stdout/err. + */ +G_GNUC_UNUSED +static int ga_run_command(const char *argv[], const char *in_str, + const char *action, Error **errp) +{ +pid_t pid; +int status; +int retcode = -1; +int infd[2] = { -1, -1 }; +int outfd[2] = { -1, -1 }; +char *str = NULL; +ssize_t len = 0; + +if ((in_str && !g_unix_open_pipe(infd, FD_CLOEXEC, NULL)) || +!g_unix_open_pipe(outfd, FD_CLOEXEC, NULL)) { +error_setg(errp, "cannot create pipe FDs"); +goto out; +} + +pid = fork(); +if (pid == 0) { +char *cherr = NULL; + +setsid(); + +if (in_str) { +/* Redirect stdin to infd. */ +close(infd[1]); +dup2(infd[0], 0); +close(infd[0]); +} else { +reopen_fd_to_null(0); +} + +/* Redirect stdout/stderr to outfd. */ +close(outfd[0]); +dup2(outfd[1], 1); +dup2(outfd[1], 2); +close(outfd[1]); + +execvp(argv[0], (char *const *)argv); + +/* Write the cause of failed exec to pipe for the parent to read it. */ +cherr = g_strdup_printf("failed to exec '%s'", argv[0]); +perror(cherr); +g_free(cherr); +_exit(EXIT_FAILURE); +} else if (pid < 0) { +error_setg_errno(errp, errno, "failed to create child process"); +goto out; +} + +if (in_str) { +close(infd[0]); +infd[0] = -1; +if (qemu_write_full(infd[1], in_str, strlen(in_str)) != +strlen(in_str)) { +error_setg_errno(errp, errno, "%s: cannot write to stdin pipe", + action); +goto out; +} +close(infd[1]); +infd[1] = -1; +} + +len = ga_pipe_read_str(outfd, ); +if (len < 0) { +error_setg_errno(errp, -len, "%s: cannot read from stdout/stderr pipe", + action); +goto out; +} + +ga_wait_child(pid, , errp); +if (*errp) { +goto out; +} + +if (!WIFEXITED(status)) { +if (len) { +error_setg(errp, "child process has terminated abnormally: %s", + str); +} else { +error_setg(errp, "child process has terminated abnormally"); +} +goto out; +} + +retcode = WEXITSTATUS(status); + +if (WEXITSTATUS(status)) { +if (len) { +error_setg(errp, "child process has failed to %s: %s", + action, str); +} else { +error_setg(errp, "child process has failed to %s: exit status %d", + action, WEXITSTATUS(status)); +} +goto out; +} + +out: +g_free(str); + +if (infd[0] != -1) { +close(infd[0]); +} +if (infd[1] != -1) { +close(infd[1]); +} +if (outfd[0] != -1) { +close(outfd[0]); +} +if (outfd[1] != -1) { +close(outfd[1]); +} + +return retcode; +} + void qmp_guest_shutdown(const char *mode, Error **errp) { const char *shutdown_flag; -- 2.39.3
Re: [PATCH v3 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-root' field
On 3/19/24 19:37, Daniel P. Berrangé wrote: > On Fri, Mar 15, 2024 at 02:29:40PM +0200, Andrey Drobyshev wrote: >> Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to >> GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo: >> used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as >> returned by statvfs(3). While on Windows guests that's all we can get >> with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in >> total file system size, as it's visible for root user. Let's add an >> optional field 'total-bytes-root' to GuestFilesystemInfo struct, which'd >> only be reported on POSIX and represent f_blocks value as returned by >> statvfs(3). >> >> While here, let's document better where those values come from in both >> POSIX and Windows. >> >> Signed-off-by: Andrey Drobyshev >> --- >> qga/commands-posix.c | 2 ++ >> qga/commands-win32.c | 1 + >> qga/qapi-schema.json | 12 +++- >> 3 files changed, 14 insertions(+), 1 deletion(-) >> >> diff --git a/qga/commands-posix.c b/qga/commands-posix.c >> index 26008db497..8207c4c47e 100644 >> --- a/qga/commands-posix.c >> +++ b/qga/commands-posix.c >> @@ -1569,8 +1569,10 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct >> FsMount *mount, >> nonroot_total = used + buf.f_bavail; >> fs->used_bytes = used * fr_size; >> fs->total_bytes = nonroot_total * fr_size; >> +fs->total_bytes_root = buf.f_blocks * fr_size; >> >> fs->has_total_bytes = true; >> +fs->has_total_bytes_root = true; >> fs->has_used_bytes = true; >> } >> >> diff --git a/qga/commands-win32.c b/qga/commands-win32.c >> index a1015757d8..9e820aad8d 100644 >> --- a/qga/commands-win32.c >> +++ b/qga/commands-win32.c >> @@ -1143,6 +1143,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(char >> *guid, Error **errp) >> fs = g_malloc(sizeof(*fs)); >> fs->name = g_strdup(guid); >> fs->has_total_bytes = false; >> +fs->has_total_bytes_root = false; > > Can we use GetDiskSpaceInformationA to return this information > on Windows ? In contrast to GetDiskFreeSpaceExA(), the > DISK_SPACE_INFORMATION struct details both the real sizes > and the current user available sizes. > It seems that this API has only been included in mingw64 recently: https://github.com/mingw-w64/mingw-w64/commit/66546556 Apparently since it happened there hasn't been a new release of mingw64, so the latest version v11.0.1 still doesn't have it. So I guess we have no choice but to leave Win fields as-is for now and switch to the new API later. >> fs->has_used_bytes = false; >> if (len == 0) { >> fs->mountpoint = g_strdup("System Reserved"); >> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json >> index b8efe31897..093a5ab602 100644 >> --- a/qga/qapi-schema.json >> +++ b/qga/qapi-schema.json >> @@ -1031,8 +1031,18 @@ >> # @type: file system type string >> # >> # @used-bytes: file system used bytes (since 3.0) >> +# * POSIX: (f_blocks - f_bfree) * f_frsize, as returned by statvfs(3) >> +# * Windows: (TotalNumberOfBytes - TotalNumberOfFreeBytes), as returned >> +# by GetDiskFreeSpaceEx() >> # >> # @total-bytes: non-root file system total bytes (since 3.0) >> +# * POSIX: (f_blocks - f_bfree + f_bavail) * f_frsize, as returned by >> +# statvfs(3) >> +# * Windows: TotalNumberOfBytes, as returned by GetDiskFreeSpaceEx() >> +# >> +# @total-bytes-root: total file system size in bytes (as visible for a >> +# priviledged user) (since 8.3) >> +# * POSIX only: (f_blocks * f_frsize), returned by statvfs(3) > > I tend to wonder whether it is really a good idea to document > our specific implementation details in the public API > > I might suggest > > @total-bytes: filesystem capacity in bytes for unprivileged users > @total-bytes-root: filesystem capacity in bytes for privileged users > My initial intent was to get rid of the necessity to dig into the sources in order to understand what those values mean. But since we're discussing changes in the implementation, I guess it is wise not to mention those details indeed. > also should we call it 'total-bytes-privilegd', to avoid UNIX specific > terminology. > Agreed. >> # >> # @disk: an array of disk hardware information that the volume lies >> # on, which may be empty if the disk type is not supported >> @@ -1042,7 +1052,7 @@ >> { 'struct': 'GuestFilesystemInfo', >>'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str', >> '*used-bytes': 'uint64', '*total-bytes': 'uint64', >> - 'disk': ['GuestDiskAddress']} } >> + '*total-bytes-root': 'uint64', 'disk': ['GuestDiskAddress']} } >> >> ## >> # @guest-get-fsinfo: >> -- >> 2.39.3 >> > > With regards, > Daniel
Re: [PATCH v3 6/7] qga/commands-posix: don't do fork()/exec() when suspending via sysfs
On 3/19/24 20:02, Daniel P. Berrangé wrote: > On Fri, Mar 15, 2024 at 02:29:45PM +0200, Andrey Drobyshev wrote: >> Since commit 246d76eba ("qga: guest_suspend: decoupling pm-utils and sys >> logic") pm-utils logic is running in a separate child from the sysfs >> logic. Now when suspending via sysfs we don't really need to do that in >> a separate process as we only need to perform one write to /sys/power/state. >> >> Let's just use g_file_set_contents() to simplify things here. >> >> Suggested-by: Daniel P. Berrangé >> Signed-off-by: Andrey Drobyshev >> --- >> qga/commands-posix.c | 41 + >> 1 file changed, 5 insertions(+), 36 deletions(-) > > Reviewed-by: Daniel P. Berrangé > >> >> diff --git a/qga/commands-posix.c b/qga/commands-posix.c >> index 610d225d30..e0ea377f65 100644 >> --- a/qga/commands-posix.c >> +++ b/qga/commands-posix.c >> @@ -1928,52 +1928,21 @@ static bool >> linux_sys_state_supports_mode(SuspendMode mode, Error **errp) >> >> static void linux_sys_state_suspend(SuspendMode mode, Error **errp) >> { >> -Error *local_err = NULL; >> +GError *local_gerr = NULL; >> const char *sysfile_strs[3] = {"disk", "mem", NULL}; >> const char *sysfile_str = sysfile_strs[mode]; >> -pid_t pid; >> -int status; >> >> if (!sysfile_str) { >> error_setg(errp, "unknown guest suspend mode"); >> return; >> } >> >> -pid = fork(); >> -if (!pid) { >> -/* child */ >> -int fd; >> - >> -setsid(); >> -reopen_fd_to_null(0); >> -reopen_fd_to_null(1); >> -reopen_fd_to_null(2); >> - >> -fd = open(LINUX_SYS_STATE_FILE, O_WRONLY); >> -if (fd < 0) { >> -_exit(EXIT_FAILURE); >> -} >> - >> -if (write(fd, sysfile_str, strlen(sysfile_str)) < 0) { >> -_exit(EXIT_FAILURE); >> -} >> - >> -_exit(EXIT_SUCCESS); >> -} else if (pid < 0) { >> -error_setg_errno(errp, errno, "failed to create child process"); >> -return; >> -} >> - >> -ga_wait_child(pid, , _err); >> -if (local_err) { >> -error_propagate(errp, local_err); >> +if (!g_file_set_contents(LINUX_SYS_STATE_FILE, sysfile_str, >> + -1, _gerr)) { >> +error_setg(errp, "suspend: cannot write to '%s': %s", >> + LINUX_SYS_STATE_FILE, local_gerr->message); > > You need to declare with "g_autoptr(GError) local_gerr = NULL" to > avoid a leak here. > Of course, thanks for noticing. >> return; >> } >> - >> -if (WEXITSTATUS(status)) { >> -error_setg(errp, "child process has failed to suspend"); >> -} >> - >> } >> >> static void guest_suspend(SuspendMode mode, Error **errp) >> -- >> 2.39.3 >> > > With regards, > Daniel
Re: [PATCH v3 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-root' field
On 3/15/24 15:44, Markus Armbruster wrote: > [?? ??? ? ?? ?? arm...@redhat.com. ???, ?? ??? ?, > ?? ?? https://aka.ms/LearnAboutSenderIdentification ] > > Andrey Drobyshev writes: > >> Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to >> GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo: >> used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as >> returned by statvfs(3). While on Windows guests that's all we can get >> with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in >> total file system size, as it's visible for root user. Let's add an >> optional field 'total-bytes-root' to GuestFilesystemInfo struct, which'd >> only be reported on POSIX and represent f_blocks value as returned by >> statvfs(3). >> >> While here, let's document better where those values come from in both >> POSIX and Windows. >> >> Signed-off-by: Andrey Drobyshev > > [...] > >> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json >> index b8efe31897..093a5ab602 100644 >> --- a/qga/qapi-schema.json >> +++ b/qga/qapi-schema.json >> @@ -1031,8 +1031,18 @@ >> # @type: file system type string >> # >> # @used-bytes: file system used bytes (since 3.0) >> +# * POSIX: (f_blocks - f_bfree) * f_frsize, as returned by statvfs(3) >> +# * Windows: (TotalNumberOfBytes - TotalNumberOfFreeBytes), as returned >> +# by GetDiskFreeSpaceEx() >> # >> # @total-bytes: non-root file system total bytes (since 3.0) >> +# * POSIX: (f_blocks - f_bfree + f_bavail) * f_frsize, as returned by >> +# statvfs(3) >> +# * Windows: TotalNumberOfBytes, as returned by GetDiskFreeSpaceEx() >> +# >> +# @total-bytes-root: total file system size in bytes (as visible for a >> +# priviledged user) (since 8.3) > > privileged > >> +# * POSIX only: (f_blocks * f_frsize), returned by statvfs(3) >> # >> # @disk: an array of disk hardware information that the volume lies >> # on, which may be empty if the disk type is not supported >> @@ -1042,7 +1052,7 @@ >> { 'struct': 'GuestFilesystemInfo', >>'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str', >> '*used-bytes': 'uint64', '*total-bytes': 'uint64', >> - 'disk': ['GuestDiskAddress']} } >> + '*total-bytes-root': 'uint64', 'disk': ['GuestDiskAddress']} } >> >> ## >> # @guest-get-fsinfo: > > Fails to build the manual: > > qga/qapi-schema.json:1019:Unexpected indentation. > > To fix, add blank lines before the lists, like this: > ># @used-bytes: file system used bytes (since 3.0) ># ># * POSIX: (f_blocks - f_bfree) * f_frsize, as returned by ># statvfs(3) ># * Windows: (TotalNumberOfBytes - TotalNumberOfFreeBytes), as ># returned by GetDiskFreeSpaceEx() ># ># @total-bytes: non-root file system total bytes (since 3.0) ># ># * POSIX: (f_blocks - f_bfree + f_bavail) * f_frsize, as returned by ># statvfs(3) ># * Windows: TotalNumberOfBytes, as returned by GetDiskFreeSpaceEx() ># ># @total-bytes-root: total file system size in bytes (as visible for a ># privileged user) (since 8.3) ># ># * POSIX only: (f_blocks * f_frsize), returned by statvfs(3) ># > > Yes, reST can be quite annoying. > For some reason in my environment build doesn't fail and file build/docs/qemu-ga-ref.7 is produced. However lists aren't indeed formatted properly. I'll wait for other patches to be reviewed and fix that in v4. Thank you for noticing.
[PATCH v3 5/7] qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper
There's no need to check for the existence of the hook executable, as the exec() call will do that for us. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 35 +++ 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 94b652d54e..610d225d30 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -736,8 +736,6 @@ static const char *fsfreeze_hook_arg_string[] = { static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp) { -int status; -pid_t pid; const char *hook; const char *arg_str = fsfreeze_hook_arg_string[arg]; Error *local_err = NULL; @@ -746,42 +744,15 @@ static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp) if (!hook) { return; } -if (access(hook, X_OK) != 0) { -error_setg_errno(errp, errno, "can't access fsfreeze hook '%s'", hook); -return; -} -slog("executing fsfreeze hook with arg '%s'", arg_str); -pid = fork(); -if (pid == 0) { -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -execl(hook, hook, arg_str, NULL); -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} +const char *argv[] = {hook, arg_str, NULL}; -ga_wait_child(pid, , _err); +slog("executing fsfreeze hook with arg '%s'", arg_str); +ga_run_command(argv, NULL, "execute fsfreeze hook", _err); if (local_err) { error_propagate(errp, local_err); return; } - -if (!WIFEXITED(status)) { -error_setg(errp, "fsfreeze hook has terminated abnormally"); -return; -} - -status = WEXITSTATUS(status); -if (status) { -error_setg(errp, "fsfreeze hook has failed with status %d", status); -return; -} } /* -- 2.39.3
[PATCH v3 7/7] qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper
There's no need to check for the existence of the "chpasswd", "pw" executables, as the exec() call will do that for us. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 96 ++-- 1 file changed, 13 insertions(+), 83 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index e0ea377f65..ffea69c3f0 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -2151,14 +2151,8 @@ void qmp_guest_set_user_password(const char *username, Error **errp) { Error *local_err = NULL; -char *passwd_path = NULL; -pid_t pid; -int status; -int datafd[2] = { -1, -1 }; -char *rawpasswddata = NULL; +g_autofree char *rawpasswddata = NULL; size_t rawpasswdlen; -char *chpasswddata = NULL; -size_t chpasswdlen; rawpasswddata = (char *)qbase64_decode(password, -1, , errp); if (!rawpasswddata) { @@ -2169,95 +2163,31 @@ void qmp_guest_set_user_password(const char *username, if (strchr(rawpasswddata, '\n')) { error_setg(errp, "forbidden characters in raw password"); -goto out; +return; } if (strchr(username, '\n') || strchr(username, ':')) { error_setg(errp, "forbidden characters in username"); -goto out; +return; } #ifdef __FreeBSD__ -chpasswddata = g_strdup(rawpasswddata); -passwd_path = g_find_program_in_path("pw"); +g_autofree char *chpasswdata = g_strdup(rawpasswddata); +const char *crypt_flag = crypted ? "-H" : "-h"; +const char *argv[] = {"pw", "usermod", "-n", username, + crypt_flag, "0", NULL}; #else -chpasswddata = g_strdup_printf("%s:%s\n", username, rawpasswddata); -passwd_path = g_find_program_in_path("chpasswd"); +g_autofree char *chpasswddata = g_strdup_printf("%s:%s\n", username, +rawpasswddata); +const char *crypt_flag = crypted ? "-e" : NULL; +const char *argv[] = {"chpasswd", crypt_flag, NULL}; #endif -chpasswdlen = strlen(chpasswddata); - -if (!passwd_path) { -error_setg(errp, "cannot find 'passwd' program in PATH"); -goto out; -} - -if (!g_unix_open_pipe(datafd, FD_CLOEXEC, NULL)) { -error_setg(errp, "cannot create pipe FDs"); -goto out; -} - -pid = fork(); -if (pid == 0) { -close(datafd[1]); -/* child */ -setsid(); -dup2(datafd[0], 0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -#ifdef __FreeBSD__ -const char *h_arg; -h_arg = (crypted) ? "-H" : "-h"; -execl(passwd_path, "pw", "usermod", "-n", username, h_arg, "0", NULL); -#else -if (crypted) { -execl(passwd_path, "chpasswd", "-e", NULL); -} else { -execl(passwd_path, "chpasswd", NULL); -} -#endif -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -goto out; -} -close(datafd[0]); -datafd[0] = -1; - -if (qemu_write_full(datafd[1], chpasswddata, chpasswdlen) != chpasswdlen) { -error_setg_errno(errp, errno, "cannot write new account password"); -goto out; -} -close(datafd[1]); -datafd[1] = -1; - -ga_wait_child(pid, , _err); +ga_run_command(argv, chpasswddata, "set user password", _err); if (local_err) { error_propagate(errp, local_err); -goto out; -} - -if (!WIFEXITED(status)) { -error_setg(errp, "child process has terminated abnormally"); -goto out; -} - -if (WEXITSTATUS(status)) { -error_setg(errp, "child process has failed to set user password"); -goto out; -} - -out: -g_free(chpasswddata); -g_free(rawpasswddata); -g_free(passwd_path); -if (datafd[0] != -1) { -close(datafd[0]); -} -if (datafd[1] != -1) { -close(datafd[1]); +return; } } #else /* __linux__ || __FreeBSD__ */ -- 2.39.3
[PATCH v3 3/7] qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper
Also remove the G_GNUC_UNUSED attribute added in the previous commit from the helper. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 39 ++- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index ad05630086..d4025e0c1e 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -108,7 +108,6 @@ static ssize_t ga_pipe_read_str(int fd[2], char **str) * sending string to stdin and taking error message from * stdout/err. */ -G_GNUC_UNUSED static int ga_run_command(const char *argv[], const char *in_str, const char *action, Error **errp) { @@ -230,8 +229,6 @@ void qmp_guest_shutdown(const char *mode, Error **errp) { const char *shutdown_flag; Error *local_err = NULL; -pid_t pid; -int status; #ifdef CONFIG_SOLARIS const char *powerdown_flag = "-i5"; @@ -260,46 +257,22 @@ void qmp_guest_shutdown(const char *mode, Error **errp) return; } -pid = fork(); -if (pid == 0) { -/* child, start the shutdown */ -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - +const char *argv[] = {"/sbin/shutdown", #ifdef CONFIG_SOLARIS -execl("/sbin/shutdown", "shutdown", shutdown_flag, "-g0", "-y", - "hypervisor initiated shutdown", (char *)NULL); + shutdown_flag, "-g0", "-y", #elif defined(CONFIG_BSD) -execl("/sbin/shutdown", "shutdown", shutdown_flag, "+0", - "hypervisor initiated shutdown", (char *)NULL); + shutdown_flag, "+0", #else -execl("/sbin/shutdown", "shutdown", "-h", shutdown_flag, "+0", - "hypervisor initiated shutdown", (char *)NULL); + "-h", shutdown_flag, "+0", #endif -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} + "hypervisor initiated shutdown", (char *) NULL}; -ga_wait_child(pid, , _err); +ga_run_command(argv, NULL, "shutdown", _err); if (local_err) { error_propagate(errp, local_err); return; } -if (!WIFEXITED(status)) { -error_setg(errp, "child process has terminated abnormally"); -return; -} - -if (WEXITSTATUS(status)) { -error_setg(errp, "child process has failed to shutdown"); -return; -} - /* succeeded */ } -- 2.39.3
[PATCH v3 4/7] qga/commands-posix: qmp_guest_set_time: use ga_run_command helper
There's no need to check for the existence of "/sbin/hwclock", the exec() call will do that for us. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 43 +++ 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index d4025e0c1e..94b652d54e 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -279,21 +279,9 @@ void qmp_guest_shutdown(const char *mode, Error **errp) void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp) { int ret; -int status; -pid_t pid; Error *local_err = NULL; struct timeval tv; -static const char hwclock_path[] = "/sbin/hwclock"; -static int hwclock_available = -1; - -if (hwclock_available < 0) { -hwclock_available = (access(hwclock_path, X_OK) == 0); -} - -if (!hwclock_available) { -error_setg(errp, QERR_UNSUPPORTED); -return; -} +const char *argv[] = {"/sbin/hwclock", has_time ? "-w" : "-s", NULL}; /* If user has passed a time, validate and set it. */ if (has_time) { @@ -324,37 +312,12 @@ void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp) * just need to synchronize the hardware clock. However, if no time was * passed, user is requesting the opposite: set the system time from the * hardware clock (RTC). */ -pid = fork(); -if (pid == 0) { -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -/* Use '/sbin/hwclock -w' to set RTC from the system time, - * or '/sbin/hwclock -s' to set the system time from RTC. */ -execl(hwclock_path, "hwclock", has_time ? "-w" : "-s", NULL); -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} - -ga_wait_child(pid, , _err); +ga_run_command(argv, NULL, "set hardware clock to system time", + _err); if (local_err) { error_propagate(errp, local_err); return; } - -if (!WIFEXITED(status)) { -error_setg(errp, "child process has terminated abnormally"); -return; -} - -if (WEXITSTATUS(status)) { -error_setg(errp, "hwclock failed to set hardware clock to system time"); -return; -} } typedef enum { -- 2.39.3
[PATCH v3 2/7] qga: introduce ga_run_command() helper for guest cmd execution
When executing guest commands in *nix environment, we repeat the same fork/exec pattern multiple times. Let's just separate it into a single helper which would also be able to feed input data into the launched process' stdin. This way we can avoid code duplication. To keep the history more bisectable, let's replace qmp commands implementations one by one. Also add G_GNUC_UNUSED attribute to the helper and remove it in the next commit. Originally-by: Yuri Pudgorodskiy Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 150 +++ 1 file changed, 150 insertions(+) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 8207c4c47e..ad05630086 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -76,6 +76,156 @@ static void ga_wait_child(pid_t pid, int *status, Error **errp) g_assert(rpid == pid); } +static ssize_t ga_pipe_read_str(int fd[2], char **str) +{ +ssize_t n, len = 0; +char buf[1024]; + +close(fd[1]); +fd[1] = -1; +while ((n = read(fd[0], buf, sizeof(buf))) != 0) { +if (n < 0) { +if (errno == EINTR) { +continue; +} else { +len = -errno; +break; +} +} +*str = g_realloc(*str, len + n + 1); +memcpy(*str + len, buf, n); +len += n; +*str[len] = '\0'; +} +close(fd[0]); +fd[0] = -1; + +return len; +} + +/* + * Helper to run command with input/output redirection, + * sending string to stdin and taking error message from + * stdout/err. + */ +G_GNUC_UNUSED +static int ga_run_command(const char *argv[], const char *in_str, + const char *action, Error **errp) +{ +pid_t pid; +int status; +int retcode = -1; +int infd[2] = { -1, -1 }; +int outfd[2] = { -1, -1 }; +char *str = NULL; +ssize_t len = 0; + +if ((in_str && !g_unix_open_pipe(infd, FD_CLOEXEC, NULL)) || +!g_unix_open_pipe(outfd, FD_CLOEXEC, NULL)) { +error_setg(errp, "cannot create pipe FDs"); +goto out; +} + +pid = fork(); +if (pid == 0) { +char *cherr = NULL; + +setsid(); + +if (in_str) { +/* Redirect stdin to infd. */ +close(infd[1]); +dup2(infd[0], 0); +close(infd[0]); +} else { +reopen_fd_to_null(0); +} + +/* Redirect stdout/stderr to outfd. */ +close(outfd[0]); +dup2(outfd[1], 1); +dup2(outfd[1], 2); +close(outfd[1]); + +execvp(argv[0], (char *const *)argv); + +/* Write the cause of failed exec to pipe for the parent to read it. */ +cherr = g_strdup_printf("failed to exec '%s'", argv[0]); +perror(cherr); +g_free(cherr); +_exit(EXIT_FAILURE); +} else if (pid < 0) { +error_setg_errno(errp, errno, "failed to create child process"); +goto out; +} + +if (in_str) { +close(infd[0]); +infd[0] = -1; +if (qemu_write_full(infd[1], in_str, strlen(in_str)) != +strlen(in_str)) { +error_setg_errno(errp, errno, "%s: cannot write to stdin pipe", + action); +goto out; +} +close(infd[1]); +infd[1] = -1; +} + +len = ga_pipe_read_str(outfd, ); +if (len < 0) { +error_setg_errno(errp, -len, "%s: cannot read from stdout/stderr pipe", + action); +goto out; +} + +ga_wait_child(pid, , errp); +if (*errp) { +goto out; +} + +if (!WIFEXITED(status)) { +if (len) { +error_setg(errp, "child process has terminated abnormally: %s", + str); +} else { +error_setg(errp, "child process has terminated abnormally"); +} +goto out; +} + +retcode = WEXITSTATUS(status); + +if (WEXITSTATUS(status)) { +if (len) { +error_setg(errp, "child process has failed to %s: %s", + action, str); +} else { +error_setg(errp, "child process has failed to %s: exit status %d", + action, WEXITSTATUS(status)); +} +goto out; +} + +out: +g_free(str); + +if (infd[0] != -1) { +close(infd[0]); +} +if (infd[1] != -1) { +close(infd[1]); +} +if (outfd[0] != -1) { +close(outfd[0]); +} +if (outfd[1] != -1) { +close(outfd[1]); +} + +return retcode; +} + void qmp_guest_shutdown(const char *mode, Error **errp) { const char *shutdown_flag; -- 2.39.3
[PATCH v3 6/7] qga/commands-posix: don't do fork()/exec() when suspending via sysfs
Since commit 246d76eba ("qga: guest_suspend: decoupling pm-utils and sys logic") pm-utils logic is running in a separate child from the sysfs logic. Now when suspending via sysfs we don't really need to do that in a separate process as we only need to perform one write to /sys/power/state. Let's just use g_file_set_contents() to simplify things here. Suggested-by: Daniel P. Berrangé Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 41 + 1 file changed, 5 insertions(+), 36 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 610d225d30..e0ea377f65 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -1928,52 +1928,21 @@ static bool linux_sys_state_supports_mode(SuspendMode mode, Error **errp) static void linux_sys_state_suspend(SuspendMode mode, Error **errp) { -Error *local_err = NULL; +GError *local_gerr = NULL; const char *sysfile_strs[3] = {"disk", "mem", NULL}; const char *sysfile_str = sysfile_strs[mode]; -pid_t pid; -int status; if (!sysfile_str) { error_setg(errp, "unknown guest suspend mode"); return; } -pid = fork(); -if (!pid) { -/* child */ -int fd; - -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -fd = open(LINUX_SYS_STATE_FILE, O_WRONLY); -if (fd < 0) { -_exit(EXIT_FAILURE); -} - -if (write(fd, sysfile_str, strlen(sysfile_str)) < 0) { -_exit(EXIT_FAILURE); -} - -_exit(EXIT_SUCCESS); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} - -ga_wait_child(pid, , _err); -if (local_err) { -error_propagate(errp, local_err); +if (!g_file_set_contents(LINUX_SYS_STATE_FILE, sysfile_str, + -1, _gerr)) { +error_setg(errp, "suspend: cannot write to '%s': %s", + LINUX_SYS_STATE_FILE, local_gerr->message); return; } - -if (WEXITSTATUS(status)) { -error_setg(errp, "child process has failed to suspend"); -} - } static void guest_suspend(SuspendMode mode, Error **errp) -- 2.39.3
[PATCH v3 0/7] qga/commands-posix: replace code duplicating commands with a helper
v2 -> v3: * Patch 2/7: - ga_pipe_read_str() helper now returns -errno in case of an error during read from pipe, so that the caller may use it to set error_setg_errno(); - ga_pipe_read_str() allocates +1 additional byte to make the string read from pipe null-terminated on every iteration; * Patch 6/7: patch is rewritten to completely get rid of fork()/exec() when suspending via sysfs, it now simply uses g_file_set_contents() (suggested by Daniel); * Patch 7/7: cosmetic change: removed unneeded brackets in an expression. v2: https://lists.nongnu.org/archive/html/qemu-devel/2024-03/msg00147.html Andrey Drobyshev (7): qga: guest-get-fsinfo: add optional 'total-bytes-root' field qga: introduce ga_run_command() helper for guest cmd execution qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper qga/commands-posix: qmp_guest_set_time: use ga_run_command helper qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper qga/commands-posix: don't do fork()/exec() when suspending via sysfs qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper qga/commands-posix.c | 404 +++ qga/commands-win32.c | 1 + qga/qapi-schema.json | 12 +- 3 files changed, 193 insertions(+), 224 deletions(-) -- 2.39.3
[PATCH v3 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-root' field
Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo: used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as returned by statvfs(3). While on Windows guests that's all we can get with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in total file system size, as it's visible for root user. Let's add an optional field 'total-bytes-root' to GuestFilesystemInfo struct, which'd only be reported on POSIX and represent f_blocks value as returned by statvfs(3). While here, let's document better where those values come from in both POSIX and Windows. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 2 ++ qga/commands-win32.c | 1 + qga/qapi-schema.json | 12 +++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 26008db497..8207c4c47e 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -1569,8 +1569,10 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct FsMount *mount, nonroot_total = used + buf.f_bavail; fs->used_bytes = used * fr_size; fs->total_bytes = nonroot_total * fr_size; +fs->total_bytes_root = buf.f_blocks * fr_size; fs->has_total_bytes = true; +fs->has_total_bytes_root = true; fs->has_used_bytes = true; } diff --git a/qga/commands-win32.c b/qga/commands-win32.c index a1015757d8..9e820aad8d 100644 --- a/qga/commands-win32.c +++ b/qga/commands-win32.c @@ -1143,6 +1143,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(char *guid, Error **errp) fs = g_malloc(sizeof(*fs)); fs->name = g_strdup(guid); fs->has_total_bytes = false; +fs->has_total_bytes_root = false; fs->has_used_bytes = false; if (len == 0) { fs->mountpoint = g_strdup("System Reserved"); diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json index b8efe31897..093a5ab602 100644 --- a/qga/qapi-schema.json +++ b/qga/qapi-schema.json @@ -1031,8 +1031,18 @@ # @type: file system type string # # @used-bytes: file system used bytes (since 3.0) +# * POSIX: (f_blocks - f_bfree) * f_frsize, as returned by statvfs(3) +# * Windows: (TotalNumberOfBytes - TotalNumberOfFreeBytes), as returned +# by GetDiskFreeSpaceEx() # # @total-bytes: non-root file system total bytes (since 3.0) +# * POSIX: (f_blocks - f_bfree + f_bavail) * f_frsize, as returned by +# statvfs(3) +# * Windows: TotalNumberOfBytes, as returned by GetDiskFreeSpaceEx() +# +# @total-bytes-root: total file system size in bytes (as visible for a +# priviledged user) (since 8.3) +# * POSIX only: (f_blocks * f_frsize), returned by statvfs(3) # # @disk: an array of disk hardware information that the volume lies # on, which may be empty if the disk type is not supported @@ -1042,7 +1052,7 @@ { 'struct': 'GuestFilesystemInfo', 'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str', '*used-bytes': 'uint64', '*total-bytes': 'uint64', - 'disk': ['GuestDiskAddress']} } + '*total-bytes-root': 'uint64', 'disk': ['GuestDiskAddress']} } ## # @guest-get-fsinfo: -- 2.39.3
Re: [PATCH v2 6/7] qga/commands-posix: use ga_run_command helper when suspending via sysfs
On 3/5/24 20:34, Daniel P. Berrangé wrote: > [Вы нечасто получаете письма от berra...@redhat.com. Узнайте, почему это > важно, по адресу https://aka.ms/LearnAboutSenderIdentification ] > > On Fri, Mar 01, 2024 at 07:28:57PM +0200, Andrey Drobyshev wrote: >> We replace the direct call to open() with a "sh -c 'echo ...'" call, so >> that it becomes an executable command. > > Introduced an indirection via the shell is a significant step > backwards IMHO. > >> >> Signed-off-by: Andrey Drobyshev >> --- >> qga/commands-posix.c | 36 >> 1 file changed, 4 insertions(+), 32 deletions(-) >> >> diff --git a/qga/commands-posix.c b/qga/commands-posix.c >> index dd2a7ad2e6..f3f4a05e2d 100644 >> --- a/qga/commands-posix.c >> +++ b/qga/commands-posix.c >> @@ -1921,49 +1921,21 @@ static void linux_sys_state_suspend(SuspendMode >> mode, Error **errp) >> Error *local_err = NULL; >> const char *sysfile_strs[3] = {"disk", "mem", NULL}; >> const char *sysfile_str = sysfile_strs[mode]; >> -pid_t pid; >> -int status; >> >> if (!sysfile_str) { >> error_setg(errp, "unknown guest suspend mode"); >> return; >> } >> >> -pid = fork(); >> -if (!pid) { >> -/* child */ >> -int fd; >> - >> -setsid(); >> -reopen_fd_to_null(0); >> -reopen_fd_to_null(1); >> -reopen_fd_to_null(2); >> - >> -fd = open(LINUX_SYS_STATE_FILE, O_WRONLY); >> -if (fd < 0) { >> -_exit(EXIT_FAILURE); >> -} >> - >> -if (write(fd, sysfile_str, strlen(sysfile_str)) < 0) { >> -_exit(EXIT_FAILURE); >> -} >> - >> -_exit(EXIT_SUCCESS); > > > This pre-existing code is strange to me. Why do we need to fork a > new process in order to write to /sys/power/state ? > > Looking at the original commit > > commit 11d0f1255bd5651f628280dc96c4ce9d63ae9236 > Author: Luiz Capitulino > Date: Tue Feb 28 11:03:03 2012 -0300 > > qemu-ga: add guest-suspend-disk > > > The code made a little more sense, as after fork() it first > tried to execve 'pm-utils', and then had the sysfs codepath > as a fallback. IOW having the sysfs code after fork() was a > much easier code structure. > > This was all changed in > > commit 246d76eba1944d7e59affb288ec27d7fcfb5d256 > Author: Daniel Henrique Barboza > Date: Thu Jun 21 07:21:50 2018 -0300 > > qga: guest_suspend: decoupling pm-utils and sys logic > > > so the pm-utils logic runs in a separate forked child > from the sysfs logic. > > AFAICT, that should have made it completely redundant to > fork a process to access /sys/power/state. > > Does anyone know of a reason to keep the fork() here ? Of > not we should just be calling 'g_file_set_contents' without > fork > In the original commit message Luiz Capitulino explained that multiple forks are needed to properly reap child processes needed for execl(). AFAIU since writing to /sys/power/state doesn't require execl(), fork() isn't needed either. I think the 2nd commit you mention simply kept things as they were since it wasn't the original goal of the patch. So we're just looking at legacy code. I agree that in this case my original patch is redundant and we may replace it with smth like g_file_set_contents(). I'll add it in v3. >> -} else if (pid < 0) { >> -error_setg_errno(errp, errno, "failed to create child process"); >> -return; >> -} >> +g_autofree char *echo_cmd = g_strdup_printf( >> +"echo %s > " LINUX_SYS_STATE_FILE, sysfile_str); >> +const char *argv[] = {"sh", "-c", echo_cmd, NULL}; >> >> -ga_wait_child(pid, , _err); >> +ga_run_command(argv, NULL, "suspend", _err); >> if (local_err) { >> error_propagate(errp, local_err); >> return; >> } >> - >> -if (WEXITSTATUS(status)) { >> -error_setg(errp, "child process has failed to suspend"); >> -} >> - >> } >> >> static void guest_suspend(SuspendMode mode, Error **errp) >> -- >> 2.39.3 >> >> > > With regards, > Daniel > -- > |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| > |: https://libvirt.org -o-https://fstop138.berrange.com :| > |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :| >
Re: [PATCH v2 2/7] qga: introduce ga_run_command() helper for guest cmd execution
On 3/5/24 19:58, Daniel P. Berrangé wrote: > On Fri, Mar 01, 2024 at 07:28:53PM +0200, Andrey Drobyshev wrote: >> When executing guest commands in *nix environment, we repeat the same >> fork/exec pattern multiple times. Let's just separate it into a single >> helper which would also be able to feed input data into the launched >> process' stdin. This way we can avoid code duplication. >> >> To keep the history more bisectable, let's replace qmp commands >> implementations one by one. Also add G_GNUC_UNUSED attribute to the >> helper and remove it in the next commit. >> >> Originally-by: Yuri Pudgorodskiy >> Signed-off-by: Andrey Drobyshev >> --- >> qga/commands-posix.c | 140 +++ >> 1 file changed, 140 insertions(+) >> >> diff --git a/qga/commands-posix.c b/qga/commands-posix.c >> index 8207c4c47e..781498418f 100644 >> --- a/qga/commands-posix.c >> +++ b/qga/commands-posix.c >> @@ -76,6 +76,146 @@ static void ga_wait_child(pid_t pid, int *status, Error >> **errp) >> g_assert(rpid == pid); >> } >> >> +static void ga_pipe_read_str(int fd[2], char **str, size_t *len) >> +{ >> +ssize_t n; >> +char buf[1024]; >> +close(fd[1]); >> +fd[1] = -1; >> +while ((n = read(fd[0], buf, sizeof(buf))) != 0) { >> +if (n < 0) { >> +if (errno == EINTR) { >> +continue; >> +} else { >> +break; > > This is a fatal error condition > >> +} >> +} >> +*str = g_realloc(*str, *len + n); >> +memcpy(*str + *len, buf, n); >> +*len += n; >> +} >> +close(fd[0]); >> +fd[0] = -1; > > and yet as far as the caller is concerned we're not indicating > any sense of failure here. They're just being returned a partially > read data stream as if it were successful. IMHO this needs to be > reported properly. > Agreed. We might make this helper return -errno in case of an erroneous read from pipe, check the value in the caller and do error_setg_errno() if smth went wrong. > > Nothing in this method has NUL terminated 'str', so we are > relying on the caller *always* honouring 'len', but. > Agreed as well. Let's allocate +1 additional byte and store '\0' in there on every iteration, making sure our result is always null-terminated. That way we won't need to check 'len' in the caller. >> +} >> + >> +/* >> + * Helper to run command with input/output redirection, >> + * sending string to stdin and taking error message from >> + * stdout/err. >> + */ >> +G_GNUC_UNUSED >> +static int ga_run_command(const char *argv[], const char *in_str, >> + const char *action, Error **errp) >> +{ >> +pid_t pid; >> +int status; >> +int retcode = -1; >> +int infd[2] = { -1, -1 }; >> +int outfd[2] = { -1, -1 }; >> +char *str = NULL; >> +size_t len = 0; >> + >> +if ((in_str && !g_unix_open_pipe(infd, FD_CLOEXEC, NULL)) || >> +!g_unix_open_pipe(outfd, FD_CLOEXEC, NULL)) { >> +error_setg(errp, "cannot create pipe FDs"); >> +goto out; >> +} >> + >> +pid = fork(); >> +if (pid == 0) { >> +char *cherr = NULL; >> + >> +setsid(); >> + >> +if (in_str) { >> +/* Redirect stdin to infd. */ >> +close(infd[1]); >> +dup2(infd[0], 0); >> +close(infd[0]); >> +} else { >> +reopen_fd_to_null(0); >> +} >> + >> +/* Redirect stdout/stderr to outfd. */ >> +close(outfd[0]); >> +dup2(outfd[1], 1); >> +dup2(outfd[1], 2); >> +close(outfd[1]); >> + >> +execvp(argv[0], (char *const *)argv); >> + >> +/* Write the cause of failed exec to pipe for the parent to read >> it. */ >> +cherr = g_strdup_printf("failed to exec '%s'", argv[0]); >> +perror(cherr); >> +g_free(cherr); >> +_exit(EXIT_FAILURE); >> +} else if (pid < 0) { >> +error_setg_errno(errp, errno, "failed to create child process"); >> +goto out; >> +} >> + >> +if (in_str) { >> +close(infd[0]); >> +infd[0] = -1; >> +if (qemu_write_full(infd[1], in_str, strlen(in_str)) != >> +
Re: [PATCH v2 7/7] qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper
On 3/5/24 20:38, Daniel P. Berrangé wrote: > On Fri, Mar 01, 2024 at 07:28:58PM +0200, Andrey Drobyshev wrote: >> There's no need to check for the existence of the "chpasswd", "pw" >> executables, as the exec() call will do that for us. >> >> Signed-off-by: Andrey Drobyshev >> --- >> qga/commands-posix.c | 96 ++-- >> 1 file changed, 13 insertions(+), 83 deletions(-) >> >> diff --git a/qga/commands-posix.c b/qga/commands-posix.c >> index f3f4a05e2d..f2e9496b80 100644 >> --- a/qga/commands-posix.c >> +++ b/qga/commands-posix.c >> @@ -2144,14 +2144,8 @@ void qmp_guest_set_user_password(const char *username, >> Error **errp) >> { >> Error *local_err = NULL; >> -char *passwd_path = NULL; >> -pid_t pid; >> -int status; >> -int datafd[2] = { -1, -1 }; >> -char *rawpasswddata = NULL; >> +g_autofree char *rawpasswddata = NULL; >> size_t rawpasswdlen; >> -char *chpasswddata = NULL; >> -size_t chpasswdlen; >> >> rawpasswddata = (char *)qbase64_decode(password, -1, , >> errp); >> if (!rawpasswddata) { >> @@ -2162,95 +2156,31 @@ void qmp_guest_set_user_password(const char >> *username, >> >> if (strchr(rawpasswddata, '\n')) { >> error_setg(errp, "forbidden characters in raw password"); >> -goto out; >> +return; >> } >> >> if (strchr(username, '\n') || >> strchr(username, ':')) { >> error_setg(errp, "forbidden characters in username"); >> -goto out; >> +return; >> } >> >> #ifdef __FreeBSD__ >> -chpasswddata = g_strdup(rawpasswddata); >> -passwd_path = g_find_program_in_path("pw"); >> +g_autofree char *chpasswdata = g_strdup(rawpasswddata); >> +const char *crypt_flag = (crypted) ? "-H" : "-h"; >> +const char *argv[] = {"pw", "usermod", "-n", username, >> + crypt_flag, "0", NULL}; >> #else >> -chpasswddata = g_strdup_printf("%s:%s\n", username, rawpasswddata); >> -passwd_path = g_find_program_in_path("chpasswd"); >> +g_autofree char *chpasswddata = g_strdup_printf("%s:%s\n", username, >> +rawpasswddata); >> +const char *crypt_flag = (crypted) ? "-e" : NULL; > > Style nit-pick - no '(...)' around 'crypted' is needed here, or > the other place later in this method. > > Yes, that was a pre-existing issue, but since you're refactoring > the code, might as well kill the redundant brackets. > > [...] Sure, let's get rid of them. Thanks.
[PATCH v2 4/7] qga/commands-posix: qmp_guest_set_time: use ga_run_command helper
There's no need to check for the existence of "/sbin/hwclock", the exec() call will do that for us. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 43 +++ 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 003054891f..e44203a273 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -269,21 +269,9 @@ void qmp_guest_shutdown(const char *mode, Error **errp) void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp) { int ret; -int status; -pid_t pid; Error *local_err = NULL; struct timeval tv; -static const char hwclock_path[] = "/sbin/hwclock"; -static int hwclock_available = -1; - -if (hwclock_available < 0) { -hwclock_available = (access(hwclock_path, X_OK) == 0); -} - -if (!hwclock_available) { -error_setg(errp, QERR_UNSUPPORTED); -return; -} +const char *argv[] = {"/sbin/hwclock", has_time ? "-w" : "-s", NULL}; /* If user has passed a time, validate and set it. */ if (has_time) { @@ -314,37 +302,12 @@ void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp) * just need to synchronize the hardware clock. However, if no time was * passed, user is requesting the opposite: set the system time from the * hardware clock (RTC). */ -pid = fork(); -if (pid == 0) { -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -/* Use '/sbin/hwclock -w' to set RTC from the system time, - * or '/sbin/hwclock -s' to set the system time from RTC. */ -execl(hwclock_path, "hwclock", has_time ? "-w" : "-s", NULL); -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} - -ga_wait_child(pid, , _err); +ga_run_command(argv, NULL, "set hardware clock to system time", + _err); if (local_err) { error_propagate(errp, local_err); return; } - -if (!WIFEXITED(status)) { -error_setg(errp, "child process has terminated abnormally"); -return; -} - -if (WEXITSTATUS(status)) { -error_setg(errp, "hwclock failed to set hardware clock to system time"); -return; -} } typedef enum { -- 2.39.3
[PATCH v2 3/7] qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper
Also remove the G_GNUC_UNUSED attribute added in the previous commit from the helper. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 39 ++- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 781498418f..003054891f 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -103,7 +103,6 @@ static void ga_pipe_read_str(int fd[2], char **str, size_t *len) * sending string to stdin and taking error message from * stdout/err. */ -G_GNUC_UNUSED static int ga_run_command(const char *argv[], const char *in_str, const char *action, Error **errp) { @@ -220,8 +219,6 @@ void qmp_guest_shutdown(const char *mode, Error **errp) { const char *shutdown_flag; Error *local_err = NULL; -pid_t pid; -int status; #ifdef CONFIG_SOLARIS const char *powerdown_flag = "-i5"; @@ -250,46 +247,22 @@ void qmp_guest_shutdown(const char *mode, Error **errp) return; } -pid = fork(); -if (pid == 0) { -/* child, start the shutdown */ -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - +const char *argv[] = {"/sbin/shutdown", #ifdef CONFIG_SOLARIS -execl("/sbin/shutdown", "shutdown", shutdown_flag, "-g0", "-y", - "hypervisor initiated shutdown", (char *)NULL); + shutdown_flag, "-g0", "-y", #elif defined(CONFIG_BSD) -execl("/sbin/shutdown", "shutdown", shutdown_flag, "+0", - "hypervisor initiated shutdown", (char *)NULL); + shutdown_flag, "+0", #else -execl("/sbin/shutdown", "shutdown", "-h", shutdown_flag, "+0", - "hypervisor initiated shutdown", (char *)NULL); + "-h", shutdown_flag, "+0", #endif -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} + "hypervisor initiated shutdown", (char *) NULL}; -ga_wait_child(pid, , _err); +ga_run_command(argv, NULL, "shutdown", _err); if (local_err) { error_propagate(errp, local_err); return; } -if (!WIFEXITED(status)) { -error_setg(errp, "child process has terminated abnormally"); -return; -} - -if (WEXITSTATUS(status)) { -error_setg(errp, "child process has failed to shutdown"); -return; -} - /* succeeded */ } -- 2.39.3
[PATCH v2 0/7] qga/commands-posix: replace code duplicating commands with a helper
v1 --> v2: * Replace commit for guest-get-fsinfo: instead of reporting statvfs(3) values directly, keep the old ones but add an additional optional field 'total-bytes-root', only reported in POSIX. Also tweak the fields description in qga/qapi-schema.json to document where the values come from. v1: https://lists.nongnu.org/archive/html/qemu-devel/2024-02/msg05766.html Andrey Drobyshev (7): qga: guest-get-fsinfo: add optional 'total-bytes-root' field qga: introduce ga_run_command() helper for guest cmd execution qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper qga/commands-posix: qmp_guest_set_time: use ga_run_command helper qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper qga/commands-posix: use ga_run_command helper when suspending via sysfs qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper qga/commands-posix.c | 389 +++ qga/commands-win32.c | 1 + qga/qapi-schema.json | 12 +- 3 files changed, 182 insertions(+), 220 deletions(-) -- 2.39.3
[PATCH v2 2/7] qga: introduce ga_run_command() helper for guest cmd execution
When executing guest commands in *nix environment, we repeat the same fork/exec pattern multiple times. Let's just separate it into a single helper which would also be able to feed input data into the launched process' stdin. This way we can avoid code duplication. To keep the history more bisectable, let's replace qmp commands implementations one by one. Also add G_GNUC_UNUSED attribute to the helper and remove it in the next commit. Originally-by: Yuri Pudgorodskiy Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 140 +++ 1 file changed, 140 insertions(+) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 8207c4c47e..781498418f 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -76,6 +76,146 @@ static void ga_wait_child(pid_t pid, int *status, Error **errp) g_assert(rpid == pid); } +static void ga_pipe_read_str(int fd[2], char **str, size_t *len) +{ +ssize_t n; +char buf[1024]; +close(fd[1]); +fd[1] = -1; +while ((n = read(fd[0], buf, sizeof(buf))) != 0) { +if (n < 0) { +if (errno == EINTR) { +continue; +} else { +break; +} +} +*str = g_realloc(*str, *len + n); +memcpy(*str + *len, buf, n); +*len += n; +} +close(fd[0]); +fd[0] = -1; +} + +/* + * Helper to run command with input/output redirection, + * sending string to stdin and taking error message from + * stdout/err. + */ +G_GNUC_UNUSED +static int ga_run_command(const char *argv[], const char *in_str, + const char *action, Error **errp) +{ +pid_t pid; +int status; +int retcode = -1; +int infd[2] = { -1, -1 }; +int outfd[2] = { -1, -1 }; +char *str = NULL; +size_t len = 0; + +if ((in_str && !g_unix_open_pipe(infd, FD_CLOEXEC, NULL)) || +!g_unix_open_pipe(outfd, FD_CLOEXEC, NULL)) { +error_setg(errp, "cannot create pipe FDs"); +goto out; +} + +pid = fork(); +if (pid == 0) { +char *cherr = NULL; + +setsid(); + +if (in_str) { +/* Redirect stdin to infd. */ +close(infd[1]); +dup2(infd[0], 0); +close(infd[0]); +} else { +reopen_fd_to_null(0); +} + +/* Redirect stdout/stderr to outfd. */ +close(outfd[0]); +dup2(outfd[1], 1); +dup2(outfd[1], 2); +close(outfd[1]); + +execvp(argv[0], (char *const *)argv); + +/* Write the cause of failed exec to pipe for the parent to read it. */ +cherr = g_strdup_printf("failed to exec '%s'", argv[0]); +perror(cherr); +g_free(cherr); +_exit(EXIT_FAILURE); +} else if (pid < 0) { +error_setg_errno(errp, errno, "failed to create child process"); +goto out; +} + +if (in_str) { +close(infd[0]); +infd[0] = -1; +if (qemu_write_full(infd[1], in_str, strlen(in_str)) != +strlen(in_str)) { +error_setg_errno(errp, errno, "%s: cannot write to stdin pipe", + action); +goto out; +} +close(infd[1]); +infd[1] = -1; +} + +ga_pipe_read_str(outfd, , ); + +ga_wait_child(pid, , errp); +if (*errp) { +goto out; +} + +if (!WIFEXITED(status)) { +if (len) { +error_setg(errp, "child process has terminated abnormally: %s", + str); +} else { +error_setg(errp, "child process has terminated abnormally"); +} +goto out; +} + +retcode = WEXITSTATUS(status); + +if (WEXITSTATUS(status)) { +if (len) { +error_setg(errp, "child process has failed to %s: %s", + action, str); +} else { +error_setg(errp, "child process has failed to %s: exit status %d", + action, WEXITSTATUS(status)); +} +goto out; +} + +out: +g_free(str); + +if (infd[0] != -1) { +close(infd[0]); +} +if (infd[1] != -1) { +close(infd[1]); +} +if (outfd[0] != -1) { +close(outfd[0]); +} +if (outfd[1] != -1) { +close(outfd[1]); +} + +return retcode; +} + void qmp_guest_shutdown(const char *mode, Error **errp) { const char *shutdown_flag; -- 2.39.3
[PATCH v2 7/7] qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper
There's no need to check for the existence of the "chpasswd", "pw" executables, as the exec() call will do that for us. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 96 ++-- 1 file changed, 13 insertions(+), 83 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index f3f4a05e2d..f2e9496b80 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -2144,14 +2144,8 @@ void qmp_guest_set_user_password(const char *username, Error **errp) { Error *local_err = NULL; -char *passwd_path = NULL; -pid_t pid; -int status; -int datafd[2] = { -1, -1 }; -char *rawpasswddata = NULL; +g_autofree char *rawpasswddata = NULL; size_t rawpasswdlen; -char *chpasswddata = NULL; -size_t chpasswdlen; rawpasswddata = (char *)qbase64_decode(password, -1, , errp); if (!rawpasswddata) { @@ -2162,95 +2156,31 @@ void qmp_guest_set_user_password(const char *username, if (strchr(rawpasswddata, '\n')) { error_setg(errp, "forbidden characters in raw password"); -goto out; +return; } if (strchr(username, '\n') || strchr(username, ':')) { error_setg(errp, "forbidden characters in username"); -goto out; +return; } #ifdef __FreeBSD__ -chpasswddata = g_strdup(rawpasswddata); -passwd_path = g_find_program_in_path("pw"); +g_autofree char *chpasswdata = g_strdup(rawpasswddata); +const char *crypt_flag = (crypted) ? "-H" : "-h"; +const char *argv[] = {"pw", "usermod", "-n", username, + crypt_flag, "0", NULL}; #else -chpasswddata = g_strdup_printf("%s:%s\n", username, rawpasswddata); -passwd_path = g_find_program_in_path("chpasswd"); +g_autofree char *chpasswddata = g_strdup_printf("%s:%s\n", username, +rawpasswddata); +const char *crypt_flag = (crypted) ? "-e" : NULL; +const char *argv[] = {"chpasswd", crypt_flag, NULL}; #endif -chpasswdlen = strlen(chpasswddata); - -if (!passwd_path) { -error_setg(errp, "cannot find 'passwd' program in PATH"); -goto out; -} - -if (!g_unix_open_pipe(datafd, FD_CLOEXEC, NULL)) { -error_setg(errp, "cannot create pipe FDs"); -goto out; -} - -pid = fork(); -if (pid == 0) { -close(datafd[1]); -/* child */ -setsid(); -dup2(datafd[0], 0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -#ifdef __FreeBSD__ -const char *h_arg; -h_arg = (crypted) ? "-H" : "-h"; -execl(passwd_path, "pw", "usermod", "-n", username, h_arg, "0", NULL); -#else -if (crypted) { -execl(passwd_path, "chpasswd", "-e", NULL); -} else { -execl(passwd_path, "chpasswd", NULL); -} -#endif -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -goto out; -} -close(datafd[0]); -datafd[0] = -1; - -if (qemu_write_full(datafd[1], chpasswddata, chpasswdlen) != chpasswdlen) { -error_setg_errno(errp, errno, "cannot write new account password"); -goto out; -} -close(datafd[1]); -datafd[1] = -1; - -ga_wait_child(pid, , _err); +ga_run_command(argv, chpasswddata, "set user password", _err); if (local_err) { error_propagate(errp, local_err); -goto out; -} - -if (!WIFEXITED(status)) { -error_setg(errp, "child process has terminated abnormally"); -goto out; -} - -if (WEXITSTATUS(status)) { -error_setg(errp, "child process has failed to set user password"); -goto out; -} - -out: -g_free(chpasswddata); -g_free(rawpasswddata); -g_free(passwd_path); -if (datafd[0] != -1) { -close(datafd[0]); -} -if (datafd[1] != -1) { -close(datafd[1]); +return; } } #else /* __linux__ || __FreeBSD__ */ -- 2.39.3
[PATCH v2 1/7] qga: guest-get-fsinfo: add optional 'total-bytes-root' field
Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo: used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail) as returned by statvfs(3). While on Windows guests that's all we can get with GetDiskFreeSpaceExA(), on POSIX guests we might also be interested in total file system size, as it's visible for root user. Let's add an optional field 'total-bytes-root' to GuestFilesystemInfo struct, which'd only be reported on POSIX and represent f_blocks value as returned by statvfs(3). While here, let's document better where those values come from in both POSIX and Windows. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 2 ++ qga/commands-win32.c | 1 + qga/qapi-schema.json | 12 +++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 26008db497..8207c4c47e 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -1569,8 +1569,10 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct FsMount *mount, nonroot_total = used + buf.f_bavail; fs->used_bytes = used * fr_size; fs->total_bytes = nonroot_total * fr_size; +fs->total_bytes_root = buf.f_blocks * fr_size; fs->has_total_bytes = true; +fs->has_total_bytes_root = true; fs->has_used_bytes = true; } diff --git a/qga/commands-win32.c b/qga/commands-win32.c index a1015757d8..9e820aad8d 100644 --- a/qga/commands-win32.c +++ b/qga/commands-win32.c @@ -1143,6 +1143,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(char *guid, Error **errp) fs = g_malloc(sizeof(*fs)); fs->name = g_strdup(guid); fs->has_total_bytes = false; +fs->has_total_bytes_root = false; fs->has_used_bytes = false; if (len == 0) { fs->mountpoint = g_strdup("System Reserved"); diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json index b8efe31897..093a5ab602 100644 --- a/qga/qapi-schema.json +++ b/qga/qapi-schema.json @@ -1031,8 +1031,18 @@ # @type: file system type string # # @used-bytes: file system used bytes (since 3.0) +# * POSIX: (f_blocks - f_bfree) * f_frsize, as returned by statvfs(3) +# * Windows: (TotalNumberOfBytes - TotalNumberOfFreeBytes), as returned +# by GetDiskFreeSpaceEx() # # @total-bytes: non-root file system total bytes (since 3.0) +# * POSIX: (f_blocks - f_bfree + f_bavail) * f_frsize, as returned by +# statvfs(3) +# * Windows: TotalNumberOfBytes, as returned by GetDiskFreeSpaceEx() +# +# @total-bytes-root: total file system size in bytes (as visible for a +# priviledged user) (since 8.3) +# * POSIX only: (f_blocks * f_frsize), returned by statvfs(3) # # @disk: an array of disk hardware information that the volume lies # on, which may be empty if the disk type is not supported @@ -1042,7 +1052,7 @@ { 'struct': 'GuestFilesystemInfo', 'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str', '*used-bytes': 'uint64', '*total-bytes': 'uint64', - 'disk': ['GuestDiskAddress']} } + '*total-bytes-root': 'uint64', 'disk': ['GuestDiskAddress']} } ## # @guest-get-fsinfo: -- 2.39.3
[PATCH v2 6/7] qga/commands-posix: use ga_run_command helper when suspending via sysfs
We replace the direct call to open() with a "sh -c 'echo ...'" call, so that it becomes an executable command. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 36 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index dd2a7ad2e6..f3f4a05e2d 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -1921,49 +1921,21 @@ static void linux_sys_state_suspend(SuspendMode mode, Error **errp) Error *local_err = NULL; const char *sysfile_strs[3] = {"disk", "mem", NULL}; const char *sysfile_str = sysfile_strs[mode]; -pid_t pid; -int status; if (!sysfile_str) { error_setg(errp, "unknown guest suspend mode"); return; } -pid = fork(); -if (!pid) { -/* child */ -int fd; - -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -fd = open(LINUX_SYS_STATE_FILE, O_WRONLY); -if (fd < 0) { -_exit(EXIT_FAILURE); -} - -if (write(fd, sysfile_str, strlen(sysfile_str)) < 0) { -_exit(EXIT_FAILURE); -} - -_exit(EXIT_SUCCESS); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} +g_autofree char *echo_cmd = g_strdup_printf( +"echo %s > " LINUX_SYS_STATE_FILE, sysfile_str); +const char *argv[] = {"sh", "-c", echo_cmd, NULL}; -ga_wait_child(pid, , _err); +ga_run_command(argv, NULL, "suspend", _err); if (local_err) { error_propagate(errp, local_err); return; } - -if (WEXITSTATUS(status)) { -error_setg(errp, "child process has failed to suspend"); -} - } static void guest_suspend(SuspendMode mode, Error **errp) -- 2.39.3
[PATCH v2 5/7] qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper
There's no need to check for the existence of the hook executable, as the exec() call will do that for us. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 35 +++ 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index e44203a273..dd2a7ad2e6 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -726,8 +726,6 @@ static const char *fsfreeze_hook_arg_string[] = { static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp) { -int status; -pid_t pid; const char *hook; const char *arg_str = fsfreeze_hook_arg_string[arg]; Error *local_err = NULL; @@ -736,42 +734,15 @@ static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp) if (!hook) { return; } -if (access(hook, X_OK) != 0) { -error_setg_errno(errp, errno, "can't access fsfreeze hook '%s'", hook); -return; -} -slog("executing fsfreeze hook with arg '%s'", arg_str); -pid = fork(); -if (pid == 0) { -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -execl(hook, hook, arg_str, NULL); -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} +const char *argv[] = {hook, arg_str, NULL}; -ga_wait_child(pid, , _err); +slog("executing fsfreeze hook with arg '%s'", arg_str); +ga_run_command(argv, NULL, "execute fsfreeze hook", _err); if (local_err) { error_propagate(errp, local_err); return; } - -if (!WIFEXITED(status)) { -error_setg(errp, "fsfreeze hook has terminated abnormally"); -return; -} - -status = WEXITSTATUS(status); -if (status) { -error_setg(errp, "fsfreeze hook has failed with status %d", status); -return; -} } /* -- 2.39.3
Re: [PATCH 1/7] qga/commands-posix: return fsinfo values directly as reported by statvfs
On 2/28/24 09:55, Marc-André Lureau wrote: > [Вы нечасто получаете письма от marcandre.lur...@redhat.com. Узнайте, почему > это важно, по адресу https://aka.ms/LearnAboutSenderIdentification ] > > Hi > > On Tue, Feb 27, 2024 at 4:38 PM Andrey Drobyshev > wrote: >> >> >> >> On 2/26/24 20:50, Konstantin Kostiuk wrote: >>> >>> Best Regards, >>> Konstantin Kostiuk. >>> >>> >>> On Mon, Feb 26, 2024 at 7:02 PM Andrey Drobyshev >>> mailto:andrey.drobys...@virtuozzo.com>> >>> wrote: >>> >>> Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to >>> GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo: >>> used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail). >>> These calculations might be obscure for the end user and require one to >>> actually get into QGA source to understand how they're obtained. Let's >>> just report the values f_blocks, f_bfree, f_bavail (in bytes) from >>> statvfs() as they are, letting the user decide how to process them >>> further. >>> >>> Originally-by: Yuri Pudgorodskiy >> <mailto:y...@virtuozzo.com>> >>> Signed-off-by: Andrey Drobyshev >> <mailto:andrey.drobys...@virtuozzo.com>> >>> --- >>> qga/commands-posix.c | 16 +++- >>> qga/qapi-schema.json | 11 +++ >>> 2 files changed, 14 insertions(+), 13 deletions(-) >>> >>> diff --git a/qga/commands-posix.c b/qga/commands-posix.c >>> index 26008db497..752ef509d0 100644 >>> --- a/qga/commands-posix.c >>> +++ b/qga/commands-posix.c >>> @@ -1554,8 +1554,7 @@ static GuestFilesystemInfo >>> *build_guest_fsinfo(strua5a0239ce5ct FsMount *mount, >>> Error **errp) >>> { >>> GuestFilesystemInfo *fs = g_malloc0(sizeof(*fs)); >>> -struct statvfs buf; >>> -unsigned long used, nonroot_total, fr_size; >>> +struct statvfs st; >>> char *devpath = g_strdup_printf("/sys/dev/block/%u:%u", >>> mount->devmajor, mount->devminor); >>> >>> @@ -1563,15 +1562,14 @@ static GuestFilesystemInfo >>> *build_guest_fsinfo(struct FsMount *mount, >>> fs->type = g_strdup(mount->devtype); >>> build_guest_fsinfo_for_device(devpath, fs, errp); >>> >>> -if (statvfs(fs->mountpoint, ) == 0) { >>> -fr_size = buf.f_frsize; >>> -used = buf.f_blocks - buf.f_bfree; >>> -nonroot_total = used + buf.f_bavail; >>> -fs->used_bytes = used * fr_size; >>> -fs->total_bytes = nonroot_total * fr_size; >>> +if (statvfs(fs->mountpoint, ) == 0) { >>> +fs->total_bytes = st.f_blocks * st.f_frsize; >>> +fs->free_bytes = st.f_bfree * st.f_frsize; >>> +fs->avail_bytes = st.f_bavail * st.f_frsize; >>> >>> fs->has_total_bytes = true; >>> -fs->has_used_bytes = true; >>> +fs->has_free_bytes = true; >>> +fs->has_avail_bytes = true; >>> } >>> >>> g_free(devpath); >>> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json >>> index b8efe31897..1cce3c1df5 100644 >>> --- a/qga/qapi-schema.json >>> +++ b/qga/qapi-schema.json >>> @@ -1030,9 +1030,12 @@ >>> # >>> # @type: file system type string >>> # >>> -# @used-bytes: file system used bytes (since 3.0) >>> +# @total-bytes: total file system size in bytes (since 8.3) >>> # >>> -# @total-bytes: non-root file system total bytes (since 3.0) >>> +# @free-bytes: amount of free space in file system in bytes (since 8.3) >>> >>> >>> I don't agree with this as it breaks backward compatibility. If we want >>> to get >>> these changes we should release a new version with both old and new fields >>> and mark old as deprecated to get a time for everyone who uses this >>> API updates its solutions. >>> >>> A similar thing was with replacing the 'blacklist' command line. >>> https://gitlab.com/qemu-project/qemu/-/commit/582a098e
Re: [PATCH 1/7] qga/commands-posix: return fsinfo values directly as reported by statvfs
On 2/26/24 20:50, Konstantin Kostiuk wrote: > > Best Regards, > Konstantin Kostiuk. > > > On Mon, Feb 26, 2024 at 7:02 PM Andrey Drobyshev > mailto:andrey.drobys...@virtuozzo.com>> > wrote: > > Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to > GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo: > used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail). > These calculations might be obscure for the end user and require one to > actually get into QGA source to understand how they're obtained. Let's > just report the values f_blocks, f_bfree, f_bavail (in bytes) from > statvfs() as they are, letting the user decide how to process them > further. > > Originally-by: Yuri Pudgorodskiy <mailto:y...@virtuozzo.com>> > Signed-off-by: Andrey Drobyshev <mailto:andrey.drobys...@virtuozzo.com>> > --- > qga/commands-posix.c | 16 +++- > qga/qapi-schema.json | 11 +++ > 2 files changed, 14 insertions(+), 13 deletions(-) > > diff --git a/qga/commands-posix.c b/qga/commands-posix.c > index 26008db497..752ef509d0 100644 > --- a/qga/commands-posix.c > +++ b/qga/commands-posix.c > @@ -1554,8 +1554,7 @@ static GuestFilesystemInfo > *build_guest_fsinfo(strua5a0239ce5ct FsMount *mount, > Error **errp) > { > GuestFilesystemInfo *fs = g_malloc0(sizeof(*fs)); > - struct statvfs buf; > - unsigned long used, nonroot_total, fr_size; > + struct statvfs st; > char *devpath = g_strdup_printf("/sys/dev/block/%u:%u", > mount->devmajor, mount->devminor); > > @@ -1563,15 +1562,14 @@ static GuestFilesystemInfo > *build_guest_fsinfo(struct FsMount *mount, > fs->type = g_strdup(mount->devtype); > build_guest_fsinfo_for_device(devpath, fs, errp); > > - if (statvfs(fs->mountpoint, ) == 0) { > - fr_size = buf.f_frsize; > - used = buf.f_blocks - buf.f_bfree; > - nonroot_total = used + buf.f_bavail; > - fs->used_bytes = used * fr_size; > - fs->total_bytes = nonroot_total * fr_size; > + if (statvfs(fs->mountpoint, ) == 0) { > + fs->total_bytes = st.f_blocks * st.f_frsize; > + fs->free_bytes = st.f_bfree * st.f_frsize; > + fs->avail_bytes = st.f_bavail * st.f_frsize; > > fs->has_total_bytes = true; > - fs->has_used_bytes = true; > + fs->has_free_bytes = true; > + fs->has_avail_bytes = true; > } > > g_free(devpath); > diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json > index b8efe31897..1cce3c1df5 100644 > --- a/qga/qapi-schema.json > +++ b/qga/qapi-schema.json > @@ -1030,9 +1030,12 @@ > # > # @type: file system type string > # > -# @used-bytes: file system used bytes (since 3.0) > +# @total-bytes: total file system size in bytes (since 8.3) > # > -# @total-bytes: non-root file system total bytes (since 3.0) > +# @free-bytes: amount of free space in file system in bytes (since 8.3) > > > I don't agree with this as it breaks backward compatibility. If we want > to get > these changes we should release a new version with both old and new fields > and mark old as deprecated to get a time for everyone who uses this > API updates its solutions. > > A similar thing was with replacing the 'blacklist' command line. > https://gitlab.com/qemu-project/qemu/-/commit/582a098e6ca00dd42f317dad8affd13e5a20bc42 > > <https://gitlab.com/qemu-project/qemu/-/commit/582a098e6ca00dd42f317dad8affd13e5a20bc42> > Currently, we support both 'blacklist' and 'block-rpcs' command line options > but the first one wrote a warning. > I agree that marking the old values as deprecated does make sense. Although my original intent with this patch is to make more sense of the existing names (e.g. total-bytes to indicate true fs size instead of just non-root fs). If so, we'd eventually have to replace the original total-bytes value with the one having new semantics. Or we could rename the existing value to smth like "total-bytes-nonroot". But either way breaks backward compatibility after all. How would you suggest to resolve it? Andrey > @Marc-André Lureau <mailto:marcandre.lur...@redhat.com> @Philippe > Mathieu-Daudé <mailto:phi...@linaro.org> > What do you think about this? > > [...]
[PATCH 7/7] qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper
There's no need to check for the existence of the "chpasswd", "pw" executables, as the exec() call will do that for us. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 96 ++-- 1 file changed, 13 insertions(+), 83 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 599f59a323..7f39636417 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -2140,14 +2140,8 @@ void qmp_guest_set_user_password(const char *username, Error **errp) { Error *local_err = NULL; -char *passwd_path = NULL; -pid_t pid; -int status; -int datafd[2] = { -1, -1 }; -char *rawpasswddata = NULL; +g_autofree char *rawpasswddata = NULL; size_t rawpasswdlen; -char *chpasswddata = NULL; -size_t chpasswdlen; rawpasswddata = (char *)qbase64_decode(password, -1, , errp); if (!rawpasswddata) { @@ -2158,95 +2152,31 @@ void qmp_guest_set_user_password(const char *username, if (strchr(rawpasswddata, '\n')) { error_setg(errp, "forbidden characters in raw password"); -goto out; +return; } if (strchr(username, '\n') || strchr(username, ':')) { error_setg(errp, "forbidden characters in username"); -goto out; +return; } #ifdef __FreeBSD__ -chpasswddata = g_strdup(rawpasswddata); -passwd_path = g_find_program_in_path("pw"); +g_autofree char *chpasswdata = g_strdup(rawpasswddata); +const char *crypt_flag = (crypted) ? "-H" : "-h"; +const char *argv[] = {"pw", "usermod", "-n", username, + crypt_flag, "0", NULL}; #else -chpasswddata = g_strdup_printf("%s:%s\n", username, rawpasswddata); -passwd_path = g_find_program_in_path("chpasswd"); +g_autofree char *chpasswddata = g_strdup_printf("%s:%s\n", username, +rawpasswddata); +const char *crypt_flag = (crypted) ? "-e" : NULL; +const char *argv[] = {"chpasswd", crypt_flag, NULL}; #endif -chpasswdlen = strlen(chpasswddata); - -if (!passwd_path) { -error_setg(errp, "cannot find 'passwd' program in PATH"); -goto out; -} - -if (!g_unix_open_pipe(datafd, FD_CLOEXEC, NULL)) { -error_setg(errp, "cannot create pipe FDs"); -goto out; -} - -pid = fork(); -if (pid == 0) { -close(datafd[1]); -/* child */ -setsid(); -dup2(datafd[0], 0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -#ifdef __FreeBSD__ -const char *h_arg; -h_arg = (crypted) ? "-H" : "-h"; -execl(passwd_path, "pw", "usermod", "-n", username, h_arg, "0", NULL); -#else -if (crypted) { -execl(passwd_path, "chpasswd", "-e", NULL); -} else { -execl(passwd_path, "chpasswd", NULL); -} -#endif -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -goto out; -} -close(datafd[0]); -datafd[0] = -1; - -if (qemu_write_full(datafd[1], chpasswddata, chpasswdlen) != chpasswdlen) { -error_setg_errno(errp, errno, "cannot write new account password"); -goto out; -} -close(datafd[1]); -datafd[1] = -1; - -ga_wait_child(pid, , _err); +ga_run_command(argv, chpasswddata, "set user password", _err); if (local_err) { error_propagate(errp, local_err); -goto out; -} - -if (!WIFEXITED(status)) { -error_setg(errp, "child process has terminated abnormally"); -goto out; -} - -if (WEXITSTATUS(status)) { -error_setg(errp, "child process has failed to set user password"); -goto out; -} - -out: -g_free(chpasswddata); -g_free(rawpasswddata); -g_free(passwd_path); -if (datafd[0] != -1) { -close(datafd[0]); -} -if (datafd[1] != -1) { -close(datafd[1]); +return; } } #else /* __linux__ || __FreeBSD__ */ -- 2.39.3
[PATCH 1/7] qga/commands-posix: return fsinfo values directly as reported by statvfs
Since the commit 25b5ff1a86 ("qga: add mountpoint usage info to GuestFilesystemInfo") we have 2 values reported in guest-get-fsinfo: used = (f_blocks - f_bfree), total = (f_blocks - f_bfree + f_bavail). These calculations might be obscure for the end user and require one to actually get into QGA source to understand how they're obtained. Let's just report the values f_blocks, f_bfree, f_bavail (in bytes) from statvfs() as they are, letting the user decide how to process them further. Originally-by: Yuri Pudgorodskiy Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 16 +++- qga/qapi-schema.json | 11 +++ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 26008db497..752ef509d0 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -1554,8 +1554,7 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct FsMount *mount, Error **errp) { GuestFilesystemInfo *fs = g_malloc0(sizeof(*fs)); -struct statvfs buf; -unsigned long used, nonroot_total, fr_size; +struct statvfs st; char *devpath = g_strdup_printf("/sys/dev/block/%u:%u", mount->devmajor, mount->devminor); @@ -1563,15 +1562,14 @@ static GuestFilesystemInfo *build_guest_fsinfo(struct FsMount *mount, fs->type = g_strdup(mount->devtype); build_guest_fsinfo_for_device(devpath, fs, errp); -if (statvfs(fs->mountpoint, ) == 0) { -fr_size = buf.f_frsize; -used = buf.f_blocks - buf.f_bfree; -nonroot_total = used + buf.f_bavail; -fs->used_bytes = used * fr_size; -fs->total_bytes = nonroot_total * fr_size; +if (statvfs(fs->mountpoint, ) == 0) { +fs->total_bytes = st.f_blocks * st.f_frsize; +fs->free_bytes = st.f_bfree * st.f_frsize; +fs->avail_bytes = st.f_bavail * st.f_frsize; fs->has_total_bytes = true; -fs->has_used_bytes = true; +fs->has_free_bytes = true; +fs->has_avail_bytes = true; } g_free(devpath); diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json index b8efe31897..1cce3c1df5 100644 --- a/qga/qapi-schema.json +++ b/qga/qapi-schema.json @@ -1030,9 +1030,12 @@ # # @type: file system type string # -# @used-bytes: file system used bytes (since 3.0) +# @total-bytes: total file system size in bytes (since 8.3) # -# @total-bytes: non-root file system total bytes (since 3.0) +# @free-bytes: amount of free space in file system in bytes (since 8.3) +# +# @avail-bytes: amount of free space in file system for unprivileged +# users in bytes (since 8.3) # # @disk: an array of disk hardware information that the volume lies # on, which may be empty if the disk type is not supported @@ -1041,8 +1044,8 @@ ## { 'struct': 'GuestFilesystemInfo', 'data': {'name': 'str', 'mountpoint': 'str', 'type': 'str', - '*used-bytes': 'uint64', '*total-bytes': 'uint64', - 'disk': ['GuestDiskAddress']} } + '*total-bytes': 'uint64', '*free-bytes': 'uint64', + '*avail-bytes': 'uint64', 'disk': ['GuestDiskAddress']} } ## # @guest-get-fsinfo: -- 2.39.3
[PATCH 3/7] qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper
Also remove the G_GNUC_UNUSED attribute added in the previous commit from the helper. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 39 ++- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 510b902b1a..4b64716bab 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -103,7 +103,6 @@ static void ga_pipe_read_str(int fd[2], char **str, size_t *len) * sending string to stdin and taking error message from * stdout/err. */ -G_GNUC_UNUSED static int ga_run_command(const char *argv[], const char *in_str, const char *action, Error **errp) { @@ -220,8 +219,6 @@ void qmp_guest_shutdown(const char *mode, Error **errp) { const char *shutdown_flag; Error *local_err = NULL; -pid_t pid; -int status; #ifdef CONFIG_SOLARIS const char *powerdown_flag = "-i5"; @@ -250,46 +247,22 @@ void qmp_guest_shutdown(const char *mode, Error **errp) return; } -pid = fork(); -if (pid == 0) { -/* child, start the shutdown */ -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - +const char *argv[] = {"/sbin/shutdown", #ifdef CONFIG_SOLARIS -execl("/sbin/shutdown", "shutdown", shutdown_flag, "-g0", "-y", - "hypervisor initiated shutdown", (char *)NULL); + shutdown_flag, "-g0", "-y", #elif defined(CONFIG_BSD) -execl("/sbin/shutdown", "shutdown", shutdown_flag, "+0", - "hypervisor initiated shutdown", (char *)NULL); + shutdown_flag, "+0", #else -execl("/sbin/shutdown", "shutdown", "-h", shutdown_flag, "+0", - "hypervisor initiated shutdown", (char *)NULL); + "-h", shutdown_flag, "+0", #endif -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} + "hypervisor initiated shutdown", (char *) NULL}; -ga_wait_child(pid, , _err); +ga_run_command(argv, NULL, "shutdown", _err); if (local_err) { error_propagate(errp, local_err); return; } -if (!WIFEXITED(status)) { -error_setg(errp, "child process has terminated abnormally"); -return; -} - -if (WEXITSTATUS(status)) { -error_setg(errp, "child process has failed to shutdown"); -return; -} - /* succeeded */ } -- 2.39.3
[PATCH 2/7] qga: introduce ga_run_command() helper for guest cmd execution
When executing guest commands in *nix environment, we repeat the same fork/exec pattern multiple times. Let's just separate it into a single helper which would also be able to feed input data into the launched process' stdin. This way we can avoid code duplication. To keep the history more bisectable, let's replace qmp commands implementations one by one. Also add G_GNUC_UNUSED attribute to the helper and remove it in the next commit. Originally-by: Yuri Pudgorodskiy Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 140 +++ 1 file changed, 140 insertions(+) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 752ef509d0..510b902b1a 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -76,6 +76,146 @@ static void ga_wait_child(pid_t pid, int *status, Error **errp) g_assert(rpid == pid); } +static void ga_pipe_read_str(int fd[2], char **str, size_t *len) +{ +ssize_t n; +char buf[1024]; +close(fd[1]); +fd[1] = -1; +while ((n = read(fd[0], buf, sizeof(buf))) != 0) { +if (n < 0) { +if (errno == EINTR) { +continue; +} else { +break; +} +} +*str = g_realloc(*str, *len + n); +memcpy(*str + *len, buf, n); +*len += n; +} +close(fd[0]); +fd[0] = -1; +} + +/* + * Helper to run command with input/output redirection, + * sending string to stdin and taking error message from + * stdout/err. + */ +G_GNUC_UNUSED +static int ga_run_command(const char *argv[], const char *in_str, + const char *action, Error **errp) +{ +pid_t pid; +int status; +int retcode = -1; +int infd[2] = { -1, -1 }; +int outfd[2] = { -1, -1 }; +char *str = NULL; +size_t len = 0; + +if ((in_str && !g_unix_open_pipe(infd, FD_CLOEXEC, NULL)) || +!g_unix_open_pipe(outfd, FD_CLOEXEC, NULL)) { +error_setg(errp, "cannot create pipe FDs"); +goto out; +} + +pid = fork(); +if (pid == 0) { +char *cherr = NULL; + +setsid(); + +if (in_str) { +/* Redirect stdin to infd. */ +close(infd[1]); +dup2(infd[0], 0); +close(infd[0]); +} else { +reopen_fd_to_null(0); +} + +/* Redirect stdout/stderr to outfd. */ +close(outfd[0]); +dup2(outfd[1], 1); +dup2(outfd[1], 2); +close(outfd[1]); + +execvp(argv[0], (char *const *)argv); + +/* Write the cause of failed exec to pipe for the parent to read it. */ +cherr = g_strdup_printf("failed to exec '%s'", argv[0]); +perror(cherr); +g_free(cherr); +_exit(EXIT_FAILURE); +} else if (pid < 0) { +error_setg_errno(errp, errno, "failed to create child process"); +goto out; +} + +if (in_str) { +close(infd[0]); +infd[0] = -1; +if (qemu_write_full(infd[1], in_str, strlen(in_str)) != +strlen(in_str)) { +error_setg_errno(errp, errno, "%s: cannot write to stdin pipe", + action); +goto out; +} +close(infd[1]); +infd[1] = -1; +} + +ga_pipe_read_str(outfd, , ); + +ga_wait_child(pid, , errp); +if (*errp) { +goto out; +} + +if (!WIFEXITED(status)) { +if (len) { +error_setg(errp, "child process has terminated abnormally: %s", + str); +} else { +error_setg(errp, "child process has terminated abnormally"); +} +goto out; +} + +retcode = WEXITSTATUS(status); + +if (WEXITSTATUS(status)) { +if (len) { +error_setg(errp, "child process has failed to %s: %s", + action, str); +} else { +error_setg(errp, "child process has failed to %s: exit status %d", + action, WEXITSTATUS(status)); +} +goto out; +} + +out: +g_free(str); + +if (infd[0] != -1) { +close(infd[0]); +} +if (infd[1] != -1) { +close(infd[1]); +} +if (outfd[0] != -1) { +close(outfd[0]); +} +if (outfd[1] != -1) { +close(outfd[1]); +} + +return retcode; +} + void qmp_guest_shutdown(const char *mode, Error **errp) { const char *shutdown_flag; -- 2.39.3
[PATCH 6/7] qga/commands-posix: use ga_run_command helper when suspending via sysfs
We replace the direct call to open() with a "sh -c 'echo ...'" call, so that it becomes an executable command. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 36 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 6baaabc29e..599f59a323 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -1917,49 +1917,21 @@ static void linux_sys_state_suspend(SuspendMode mode, Error **errp) Error *local_err = NULL; const char *sysfile_strs[3] = {"disk", "mem", NULL}; const char *sysfile_str = sysfile_strs[mode]; -pid_t pid; -int status; if (!sysfile_str) { error_setg(errp, "unknown guest suspend mode"); return; } -pid = fork(); -if (!pid) { -/* child */ -int fd; - -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -fd = open(LINUX_SYS_STATE_FILE, O_WRONLY); -if (fd < 0) { -_exit(EXIT_FAILURE); -} - -if (write(fd, sysfile_str, strlen(sysfile_str)) < 0) { -_exit(EXIT_FAILURE); -} - -_exit(EXIT_SUCCESS); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} +g_autofree char *echo_cmd = g_strdup_printf( +"echo %s > " LINUX_SYS_STATE_FILE, sysfile_str); +const char *argv[] = {"sh", "-c", echo_cmd, NULL}; -ga_wait_child(pid, , _err); +ga_run_command(argv, NULL, "suspend", _err); if (local_err) { error_propagate(errp, local_err); return; } - -if (WEXITSTATUS(status)) { -error_setg(errp, "child process has failed to suspend"); -} - } static void guest_suspend(SuspendMode mode, Error **errp) -- 2.39.3
[PATCH 0/7] qga/commands-posix: replace code duplicating commands with a helper
This series simply replaces repeating fork()/exec() pattern with a separate helper to avoid code duplication. It's easier to setup and use than g_spawn_async_with_pipes() (which we'd need since some commands require input). While here, also make qmp_guest_get_fsinfo return more straightforward values. Andrey Drobyshev (7): qga/commands-posix: return fsinfo values directly as reported by statvfs qga: introduce ga_run_command() helper for guest cmd execution qga/commands-posix: qmp_guest_shutdown: use ga_run_command helper qga/commands-posix: qmp_guest_set_time: use ga_run_command helper qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper qga/commands-posix: use ga_run_command helper when suspending via sysfs qga/commands-posix: qmp_guest_set_user_password: use ga_run_command helper qga/commands-posix.c | 402 +++ qga/qapi-schema.json | 11 +- 2 files changed, 181 insertions(+), 232 deletions(-) -- 2.39.3
[PATCH 5/7] qga/commands-posix: execute_fsfreeze_hook: use ga_run_command helper
There's no need to check for the existence of the hook executable, as the exec() call will do that for us. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 35 +++ 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index b704006c98..6baaabc29e 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -726,8 +726,6 @@ static const char *fsfreeze_hook_arg_string[] = { static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp) { -int status; -pid_t pid; const char *hook; const char *arg_str = fsfreeze_hook_arg_string[arg]; Error *local_err = NULL; @@ -736,42 +734,15 @@ static void execute_fsfreeze_hook(FsfreezeHookArg arg, Error **errp) if (!hook) { return; } -if (access(hook, X_OK) != 0) { -error_setg_errno(errp, errno, "can't access fsfreeze hook '%s'", hook); -return; -} -slog("executing fsfreeze hook with arg '%s'", arg_str); -pid = fork(); -if (pid == 0) { -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -execl(hook, hook, arg_str, NULL); -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} +const char *argv[] = {hook, arg_str, NULL}; -ga_wait_child(pid, , _err); +slog("executing fsfreeze hook with arg '%s'", arg_str); +ga_run_command(argv, NULL, "execute fsfreeze hook", _err); if (local_err) { error_propagate(errp, local_err); return; } - -if (!WIFEXITED(status)) { -error_setg(errp, "fsfreeze hook has terminated abnormally"); -return; -} - -status = WEXITSTATUS(status); -if (status) { -error_setg(errp, "fsfreeze hook has failed with status %d", status); -return; -} } /* -- 2.39.3
[PATCH 4/7] qga/commands-posix: qmp_guest_set_time: use ga_run_command helper
There's no need to check for the existence of "/sbin/hwclock", the exec() call will do that for us. Signed-off-by: Andrey Drobyshev --- qga/commands-posix.c | 43 +++ 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/qga/commands-posix.c b/qga/commands-posix.c index 4b64716bab..b704006c98 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -269,21 +269,9 @@ void qmp_guest_shutdown(const char *mode, Error **errp) void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp) { int ret; -int status; -pid_t pid; Error *local_err = NULL; struct timeval tv; -static const char hwclock_path[] = "/sbin/hwclock"; -static int hwclock_available = -1; - -if (hwclock_available < 0) { -hwclock_available = (access(hwclock_path, X_OK) == 0); -} - -if (!hwclock_available) { -error_setg(errp, QERR_UNSUPPORTED); -return; -} +const char *argv[] = {"/sbin/hwclock", has_time ? "-w" : "-s", NULL}; /* If user has passed a time, validate and set it. */ if (has_time) { @@ -314,37 +302,12 @@ void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp) * just need to synchronize the hardware clock. However, if no time was * passed, user is requesting the opposite: set the system time from the * hardware clock (RTC). */ -pid = fork(); -if (pid == 0) { -setsid(); -reopen_fd_to_null(0); -reopen_fd_to_null(1); -reopen_fd_to_null(2); - -/* Use '/sbin/hwclock -w' to set RTC from the system time, - * or '/sbin/hwclock -s' to set the system time from RTC. */ -execl(hwclock_path, "hwclock", has_time ? "-w" : "-s", NULL); -_exit(EXIT_FAILURE); -} else if (pid < 0) { -error_setg_errno(errp, errno, "failed to create child process"); -return; -} - -ga_wait_child(pid, , _err); +ga_run_command(argv, NULL, "set hardware clock to system time", + _err); if (local_err) { error_propagate(errp, local_err); return; } - -if (!WIFEXITED(status)) { -error_setg(errp, "child process has terminated abnormally"); -return; -} - -if (WEXITSTATUS(status)) { -error_setg(errp, "hwclock failed to set hardware clock to system time"); -return; -} } typedef enum { -- 2.39.3
Re: [PATCH] kvm: emit GUEST_PANICKED event in case of abnormal KVM exit
On 1/25/24 18:36, Andrey Drobyshev wrote: > On 1/11/24 15:15, Andrey Drobyshev wrote: >> On 11/1/23 18:13, Denis V. Lunev wrote: >>> On 11/1/23 16:23, Andrey Drobyshev wrote: >>>> Currently we emit GUEST_PANICKED event in case kvm_vcpu_ioctl() returns >>>> KVM_EXIT_SYSTEM_EVENT with the event type KVM_SYSTEM_EVENT_CRASH. Let's >>>> extend this scenario and emit GUEST_PANICKED in case of an abnormal KVM >>>> exit. That's a natural thing to do since in this case guest is no >>>> longer operational anyway. >>>> >>>> Signed-off-by: Andrey Drobyshev >>>> Acked-by: Denis V. Lunev >>>> --- >>>> accel/kvm/kvm-all.c | 19 +++ >>>> 1 file changed, 15 insertions(+), 4 deletions(-) >>>> >>>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c >>>> index e39a810a4e..d74b3f0b0e 100644 >>>> --- a/accel/kvm/kvm-all.c >>>> +++ b/accel/kvm/kvm-all.c >>>> @@ -2816,6 +2816,14 @@ static void kvm_eat_signals(CPUState *cpu) >>>> } while (sigismember(, SIG_IPI)); >>>> } >>>> +static void kvm_emit_guest_crash(CPUState *cpu) >>>> +{ >>>> + kvm_cpu_synchronize_state(cpu); >>>> + qemu_mutex_lock_iothread(); >>>> + qemu_system_guest_panicked(cpu_get_crash_info(cpu)); >>>> + qemu_mutex_unlock_iothread(); >>>> +} >>>> + >>>> int kvm_cpu_exec(CPUState *cpu) >>>> { >>>> struct kvm_run *run = cpu->kvm_run; >>>> @@ -2969,21 +2977,24 @@ int kvm_cpu_exec(CPUState *cpu) >>>> ret = EXCP_INTERRUPT; >>>> break; >>>> case KVM_SYSTEM_EVENT_CRASH: >>>> - kvm_cpu_synchronize_state(cpu); >>>> - qemu_mutex_lock_iothread(); >>>> - qemu_system_guest_panicked(cpu_get_crash_info(cpu)); >>>> - qemu_mutex_unlock_iothread(); >>>> + kvm_emit_guest_crash(cpu); >>>> ret = 0; >>>> break; >>>> default: >>>> DPRINTF("kvm_arch_handle_exit\n"); >>>> ret = kvm_arch_handle_exit(cpu, run); >>>> + if (ret < 0) { >>>> + kvm_emit_guest_crash(cpu); >>>> + } >>>> break; >>>> } >>>> break; >>>> default: >>>> DPRINTF("kvm_arch_handle_exit\n"); >>>> ret = kvm_arch_handle_exit(cpu, run); >>>> + if (ret < 0) { >>>> + kvm_emit_guest_crash(cpu); >>>> + } >>>> break; >>>> } >>>> } while (ret == 0); >>> This allows to gracefully handle this problem in production >>> and reset the guest using on_crash action in libvirt. >> >> Ping > > Ping Yet another ping
Re: [PATCH] block: allocate aligned write buffer for 'truncate -m full'
On 1/25/24 18:46, Vladimir Sementsov-Ogievskiy wrote: > On 11.12.23 13:55, Andrey Drobyshev wrote: >> In case we're truncating an image opened with O_DIRECT, we might get >> -EINVAL on write with unaligned buffer. In particular, when running >> iotests/298 with '-nocache' we get: >> >> qemu-io: Failed to resize underlying file: Could not write zeros for >> preallocation: Invalid argument >> >> Let's just allocate the buffer using qemu_blockalign0() instead. >> >> Signed-off-by: Andrey Drobyshev > > Reviewed-by: Vladimir Sementsov-Ogievskiy > > I also suggest to use QEMU_AUTO_VFREE (keep my r-b if you do). > >> --- >> block/file-posix.c | 4 ++-- >> 1 file changed, 2 insertions(+), 2 deletions(-) >> >> diff --git a/block/file-posix.c b/block/file-posix.c >> index b862406c71..cee8de510b 100644 >> --- a/block/file-posix.c >> +++ b/block/file-posix.c >> @@ -2354,7 +2354,7 @@ static int handle_aiocb_truncate(void *opaque) >> goto out; >> } >> - buf = g_malloc0(65536); >> + buf = qemu_blockalign0(aiocb->bs, 65536); >> seek_result = lseek(fd, current_length, SEEK_SET); >> if (seek_result < 0) { >> @@ -2413,7 +2413,7 @@ out: >> } >> } >> - g_free(buf); >> + qemu_vfree(buf); >> return result; >> } >> > Yet another ping, just checking if any of the block maintainers is interested
Re: [PATCH] iotests: don't run tests requiring cached writes in '-nocache' mode
On 1/26/24 12:24, Kevin Wolf wrote: > Am 11.12.2023 um 14:32 hat Andrey Drobyshev geschrieben: >> There're tests whose logic implies running without O_DIRECT set, >> otherwise they fail when running iotests in '-nocache' mode. For these >> tests let's add _require_no_o_direct() helper which can be put in the >> preabmle and which makes sure '-nocache' isn't set. Use it to skip >> running the following tests: >> >> * 271: creates files with unaligned sizes, thus producing multiple >> errors like: >> >> qemu-io: can't open device /path/to/t.qcow2.raw: Cannot get 'write' >> permission without 'resize': Image size is not a multiple of request >> alignment >> >> * 308, file-io-error: use fuse exports. Though fuse does have >> 'direct-io' mode (see https://docs.kernel.org/filesystems/fuse-io.html) >> we aren't using it yet, thus getting errors like: >> >> qemu-io: can't open device /path/to/t.qcow2.fuse: Could not open >> '/path/to/t.qcow2.fuse': filesystem does not support O_DIRECT >> >> Signed-off-by: Andrey Drobyshev > > How are you running qemu-iotests to make these tests fail? I tried to > reproduce, but they just pass for me: > > $ tests/qemu-iotests/check -qcow2 -nocache 271 308 file-io-error > [...] > 271 pass [11:20:50] [11:21:11] 21.1s (last: 20.4s) > 308 pass [11:21:11] [11:21:14] 3.3s (last: 3.3s) > file-io-error pass [11:21:14] [11:21:14] 0.3s (last: 0.3s) > Passed all 3 iotests > > $ tests/qemu-iotests/check -raw -nocache 271 308 file-io-error > 271 not run[11:21:20] [11:21:21] ... not > suitable for this image format: raw > 308 pass [11:21:21] [11:21:24] 3.8s (last: 2.8s) > file-io-error pass [11:21:24] [11:21:25] 0.3s (last: 0.3s) > Not run: 271 > Passed all 2 iotests > > Kevin > As for the test 271, I imagine this might be caused by different request alignment. The failure occurs in block.c, bdrv_node_refresh_perm(). If I print the alignment out explicitly, I get: qemu-io: can't open device /path/to/t.qcow2.raw: Cannot get 'write' permission without 'resize': Image size is not a multiple of request alignment: 4096 For the record, I'm running tests on ext4. I'm not sure about the fuse tests though. Could it also have smth to do with the underlying fs? Andrey
Re: [PATCH] block: allocate aligned write buffer for 'truncate -m full'
On 1/11/24 14:53, Andrey Drobyshev wrote: > On 12/11/23 13:27, Denis V. Lunev wrote: >> On 12/11/23 11:55, Andrey Drobyshev wrote: >>> In case we're truncating an image opened with O_DIRECT, we might get >>> -EINVAL on write with unaligned buffer. In particular, when running >>> iotests/298 with '-nocache' we get: >>> >>> qemu-io: Failed to resize underlying file: Could not write zeros for >>> preallocation: Invalid argument >>> >>> Let's just allocate the buffer using qemu_blockalign0() instead. >>> >>> Signed-off-by: Andrey Drobyshev >>> --- >>> block/file-posix.c | 4 ++-- >>> 1 file changed, 2 insertions(+), 2 deletions(-) >>> >>> diff --git a/block/file-posix.c b/block/file-posix.c >>> index b862406c71..cee8de510b 100644 >>> --- a/block/file-posix.c >>> +++ b/block/file-posix.c >>> @@ -2354,7 +2354,7 @@ static int handle_aiocb_truncate(void *opaque) >>> goto out; >>> } >>> - buf = g_malloc0(65536); >>> + buf = qemu_blockalign0(aiocb->bs, 65536); >>> seek_result = lseek(fd, current_length, SEEK_SET); >>> if (seek_result < 0) { >>> @@ -2413,7 +2413,7 @@ out: >>> } >>> } >>> - g_free(buf); >>> + qemu_vfree(buf); >>> return result; >>> } >>> >> Reviewed-by: Denis V. Lunev > > Ping Ping
Re: [PATCH] kvm: emit GUEST_PANICKED event in case of abnormal KVM exit
On 1/11/24 15:15, Andrey Drobyshev wrote: > On 11/1/23 18:13, Denis V. Lunev wrote: >> On 11/1/23 16:23, Andrey Drobyshev wrote: >>> Currently we emit GUEST_PANICKED event in case kvm_vcpu_ioctl() returns >>> KVM_EXIT_SYSTEM_EVENT with the event type KVM_SYSTEM_EVENT_CRASH. Let's >>> extend this scenario and emit GUEST_PANICKED in case of an abnormal KVM >>> exit. That's a natural thing to do since in this case guest is no >>> longer operational anyway. >>> >>> Signed-off-by: Andrey Drobyshev >>> Acked-by: Denis V. Lunev >>> --- >>> accel/kvm/kvm-all.c | 19 +++ >>> 1 file changed, 15 insertions(+), 4 deletions(-) >>> >>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c >>> index e39a810a4e..d74b3f0b0e 100644 >>> --- a/accel/kvm/kvm-all.c >>> +++ b/accel/kvm/kvm-all.c >>> @@ -2816,6 +2816,14 @@ static void kvm_eat_signals(CPUState *cpu) >>> } while (sigismember(, SIG_IPI)); >>> } >>> +static void kvm_emit_guest_crash(CPUState *cpu) >>> +{ >>> + kvm_cpu_synchronize_state(cpu); >>> + qemu_mutex_lock_iothread(); >>> + qemu_system_guest_panicked(cpu_get_crash_info(cpu)); >>> + qemu_mutex_unlock_iothread(); >>> +} >>> + >>> int kvm_cpu_exec(CPUState *cpu) >>> { >>> struct kvm_run *run = cpu->kvm_run; >>> @@ -2969,21 +2977,24 @@ int kvm_cpu_exec(CPUState *cpu) >>> ret = EXCP_INTERRUPT; >>> break; >>> case KVM_SYSTEM_EVENT_CRASH: >>> - kvm_cpu_synchronize_state(cpu); >>> - qemu_mutex_lock_iothread(); >>> - qemu_system_guest_panicked(cpu_get_crash_info(cpu)); >>> - qemu_mutex_unlock_iothread(); >>> + kvm_emit_guest_crash(cpu); >>> ret = 0; >>> break; >>> default: >>> DPRINTF("kvm_arch_handle_exit\n"); >>> ret = kvm_arch_handle_exit(cpu, run); >>> + if (ret < 0) { >>> + kvm_emit_guest_crash(cpu); >>> + } >>> break; >>> } >>> break; >>> default: >>> DPRINTF("kvm_arch_handle_exit\n"); >>> ret = kvm_arch_handle_exit(cpu, run); >>> + if (ret < 0) { >>> + kvm_emit_guest_crash(cpu); >>> + } >>> break; >>> } >>> } while (ret == 0); >> This allows to gracefully handle this problem in production >> and reset the guest using on_crash action in libvirt. > > Ping Ping
Re: [PATCH] iotests: don't run tests requiring cached writes in '-nocache' mode
On 1/11/24 14:53, Andrey Drobyshev wrote: > On 12/11/23 15:32, Andrey Drobyshev wrote: >> There're tests whose logic implies running without O_DIRECT set, >> otherwise they fail when running iotests in '-nocache' mode. For these >> tests let's add _require_no_o_direct() helper which can be put in the >> preabmle and which makes sure '-nocache' isn't set. Use it to skip >> running the following tests: >> >> * 271: creates files with unaligned sizes, thus producing multiple >> errors like: >> >> qemu-io: can't open device /path/to/t.qcow2.raw: Cannot get 'write' >> permission without 'resize': Image size is not a multiple of request >> alignment >> >> * 308, file-io-error: use fuse exports. Though fuse does have >> 'direct-io' mode (see https://docs.kernel.org/filesystems/fuse-io.html) >> we aren't using it yet, thus getting errors like: >> >> qemu-io: can't open device /path/to/t.qcow2.fuse: Could not open >> '/path/to/t.qcow2.fuse': filesystem does not support O_DIRECT >> >> Signed-off-by: Andrey Drobyshev >> --- >> tests/qemu-iotests/271 | 1 + >> tests/qemu-iotests/308 | 2 ++ >> tests/qemu-iotests/common.rc | 7 +++ >> tests/qemu-iotests/tests/file-io-error | 1 + >> 4 files changed, 11 insertions(+) >> >> diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271 >> index 59a6fafa2f..1424b6954d 100755 >> --- a/tests/qemu-iotests/271 >> +++ b/tests/qemu-iotests/271 >> @@ -44,6 +44,7 @@ _supported_fmt qcow2 >> _supported_proto file nfs >> _supported_os Linux >> _unsupported_imgopts extended_l2 compat=0.10 cluster_size data_file >> refcount_bits=1[^0-9] >> +_require_no_o_direct >> >> l2_offset=$((0x4)) >> >> diff --git a/tests/qemu-iotests/308 b/tests/qemu-iotests/308 >> index de12b2b1b9..535455e5b1 100755 >> --- a/tests/qemu-iotests/308 >> +++ b/tests/qemu-iotests/308 >> @@ -52,6 +52,8 @@ _unsupported_fmt vpc >> _supported_proto file # We create the FUSE export manually >> _supported_os Linux # We need /dev/urandom >> >> +_require_no_o_direct >> + >> # $1: Export ID >> # $2: Options (beyond the node-name and ID) >> # $3: Expected return value (defaults to 'return') >> diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc >> index 95c12577dd..f61eae73b4 100644 >> --- a/tests/qemu-iotests/common.rc >> +++ b/tests/qemu-iotests/common.rc >> @@ -857,6 +857,13 @@ _check_o_direct() >> [[ "$out" != *"O_DIRECT"* ]] >> } >> >> +_require_no_o_direct() >> +{ >> +if [ $CACHEMODE == "none" ] || [ $CACHEMODE == "directsync" ]; then >> +_notrun "not suitable for cache mode: $CACHEMODE (implies O_DIRECT)" >> +fi >> +} >> + >> _require_o_direct() >> { >> if ! _check_o_direct; then >> diff --git a/tests/qemu-iotests/tests/file-io-error >> b/tests/qemu-iotests/tests/file-io-error >> index 88ee5f670c..2b8dc7f009 100755 >> --- a/tests/qemu-iotests/tests/file-io-error >> +++ b/tests/qemu-iotests/tests/file-io-error >> @@ -40,6 +40,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 >> # Format-agnostic (we do not use any), but we do test the file protocol >> _supported_proto file >> _require_drivers blkdebug null-co >> +_require_no_o_direct >> >> if [ "$IMGOPTSSYNTAX" = "true" ]; then >> # We need `$QEMU_IO -f file` to work; IMGOPTSSYNTAX uses --image-opts, > > Ping Ping
Re: [PATCH] iotests/277: Use iotests.sock_dir for socket creation
On 1/24/24 19:59, Denis V. Lunev wrote: > On 1/24/24 18:43, Eric Blake wrote: >> On Wed, Jan 24, 2024 at 06:22:57PM +0200, Andrey Drobyshev wrote: >>> If socket path is too long (longer than 108 bytes), socket can't be >>> opened. This might lead to failure when test dir path is long enough. >>> Make sure socket is created in iotests.sock_dir to avoid such a case. >>> >>> This commit basically aligns iotests/277 with the rest of iotests. >>> >>> Signed-off-by: Andrey Drobyshev >>> --- >>> tests/qemu-iotests/277 | 3 ++- >>> 1 file changed, 2 insertions(+), 1 deletion(-) >> Reviewed-by: Eric Blake >> >>> diff --git a/tests/qemu-iotests/277 b/tests/qemu-iotests/277 >>> index 24833e7eb6..4224202ac2 100755 >>> --- a/tests/qemu-iotests/277 >>> +++ b/tests/qemu-iotests/277 >>> @@ -27,7 +27,8 @@ from iotests import file_path, log >>> iotests.script_initialize() >>> -nbd_sock, conf_file = file_path('nbd-sock', >>> 'nbd-fault-injector.conf') >>> +conf_file = file_path('nbd-fault-injector.conf') >>> +nbd_sock = file_path('nbd-sock', base_dir=iotests.sock_dir) >>> def make_conf_file(event): >>> -- >>> 2.39.3 >>> > I would say that potentially the same code is present > in 264, it is : > disk_a, disk_b, nbd_sock = file_path('disk_a', 'disk_b', 'nbd-sock') > nbd_uri = 'nbd+unix:///?socket=' + nbd_sock Thanks, have just sent a separate fix for iotests/264. Haven't found any others. Andrey
[PATCH] iotests/264: Use iotests.sock_dir for socket creation
If socket path is too long (longer than 108 bytes), socket can't be opened. This might lead to failure when test dir path is long enough. Make sure socket is created in iotests.sock_dir to avoid such a case. This commit basically aligns iotests/264 with the rest of iotests. Signed-off-by: Andrey Drobyshev --- tests/qemu-iotests/264 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/qemu-iotests/264 b/tests/qemu-iotests/264 index c532ccd809..c6ba2754e2 100755 --- a/tests/qemu-iotests/264 +++ b/tests/qemu-iotests/264 @@ -25,7 +25,8 @@ import os import iotests from iotests import qemu_img_create, file_path, qemu_nbd_popen -disk_a, disk_b, nbd_sock = file_path('disk_a', 'disk_b', 'nbd-sock') +disk_a, disk_b = file_path('disk_a', 'disk_b') +nbd_sock = file_path('nbd-sock', base_dir=iotests.sock_dir) nbd_uri = 'nbd+unix:///?socket=' + nbd_sock wait_limit = 3.0 wait_step = 0.2 -- 2.39.3
[PATCH] iotests/277: Use iotests.sock_dir for socket creation
If socket path is too long (longer than 108 bytes), socket can't be opened. This might lead to failure when test dir path is long enough. Make sure socket is created in iotests.sock_dir to avoid such a case. This commit basically aligns iotests/277 with the rest of iotests. Signed-off-by: Andrey Drobyshev --- tests/qemu-iotests/277 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/qemu-iotests/277 b/tests/qemu-iotests/277 index 24833e7eb6..4224202ac2 100755 --- a/tests/qemu-iotests/277 +++ b/tests/qemu-iotests/277 @@ -27,7 +27,8 @@ from iotests import file_path, log iotests.script_initialize() -nbd_sock, conf_file = file_path('nbd-sock', 'nbd-fault-injector.conf') +conf_file = file_path('nbd-fault-injector.conf') +nbd_sock = file_path('nbd-sock', base_dir=iotests.sock_dir) def make_conf_file(event): -- 2.39.3
Re: [PATCH] kvm: emit GUEST_PANICKED event in case of abnormal KVM exit
On 11/1/23 18:13, Denis V. Lunev wrote: > On 11/1/23 16:23, Andrey Drobyshev wrote: >> Currently we emit GUEST_PANICKED event in case kvm_vcpu_ioctl() returns >> KVM_EXIT_SYSTEM_EVENT with the event type KVM_SYSTEM_EVENT_CRASH. Let's >> extend this scenario and emit GUEST_PANICKED in case of an abnormal KVM >> exit. That's a natural thing to do since in this case guest is no >> longer operational anyway. >> >> Signed-off-by: Andrey Drobyshev >> Acked-by: Denis V. Lunev >> --- >> accel/kvm/kvm-all.c | 19 +++ >> 1 file changed, 15 insertions(+), 4 deletions(-) >> >> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c >> index e39a810a4e..d74b3f0b0e 100644 >> --- a/accel/kvm/kvm-all.c >> +++ b/accel/kvm/kvm-all.c >> @@ -2816,6 +2816,14 @@ static void kvm_eat_signals(CPUState *cpu) >> } while (sigismember(, SIG_IPI)); >> } >> +static void kvm_emit_guest_crash(CPUState *cpu) >> +{ >> + kvm_cpu_synchronize_state(cpu); >> + qemu_mutex_lock_iothread(); >> + qemu_system_guest_panicked(cpu_get_crash_info(cpu)); >> + qemu_mutex_unlock_iothread(); >> +} >> + >> int kvm_cpu_exec(CPUState *cpu) >> { >> struct kvm_run *run = cpu->kvm_run; >> @@ -2969,21 +2977,24 @@ int kvm_cpu_exec(CPUState *cpu) >> ret = EXCP_INTERRUPT; >> break; >> case KVM_SYSTEM_EVENT_CRASH: >> - kvm_cpu_synchronize_state(cpu); >> - qemu_mutex_lock_iothread(); >> - qemu_system_guest_panicked(cpu_get_crash_info(cpu)); >> - qemu_mutex_unlock_iothread(); >> + kvm_emit_guest_crash(cpu); >> ret = 0; >> break; >> default: >> DPRINTF("kvm_arch_handle_exit\n"); >> ret = kvm_arch_handle_exit(cpu, run); >> + if (ret < 0) { >> + kvm_emit_guest_crash(cpu); >> + } >> break; >> } >> break; >> default: >> DPRINTF("kvm_arch_handle_exit\n"); >> ret = kvm_arch_handle_exit(cpu, run); >> + if (ret < 0) { >> + kvm_emit_guest_crash(cpu); >> + } >> break; >> } >> } while (ret == 0); > This allows to gracefully handle this problem in production > and reset the guest using on_crash action in libvirt. Ping
Re: [PATCH] block: allocate aligned write buffer for 'truncate -m full'
On 12/11/23 13:27, Denis V. Lunev wrote: > On 12/11/23 11:55, Andrey Drobyshev wrote: >> In case we're truncating an image opened with O_DIRECT, we might get >> -EINVAL on write with unaligned buffer. In particular, when running >> iotests/298 with '-nocache' we get: >> >> qemu-io: Failed to resize underlying file: Could not write zeros for >> preallocation: Invalid argument >> >> Let's just allocate the buffer using qemu_blockalign0() instead. >> >> Signed-off-by: Andrey Drobyshev >> --- >> block/file-posix.c | 4 ++-- >> 1 file changed, 2 insertions(+), 2 deletions(-) >> >> diff --git a/block/file-posix.c b/block/file-posix.c >> index b862406c71..cee8de510b 100644 >> --- a/block/file-posix.c >> +++ b/block/file-posix.c >> @@ -2354,7 +2354,7 @@ static int handle_aiocb_truncate(void *opaque) >> goto out; >> } >> - buf = g_malloc0(65536); >> + buf = qemu_blockalign0(aiocb->bs, 65536); >> seek_result = lseek(fd, current_length, SEEK_SET); >> if (seek_result < 0) { >> @@ -2413,7 +2413,7 @@ out: >> } >> } >> - g_free(buf); >> + qemu_vfree(buf); >> return result; >> } >> > Reviewed-by: Denis V. Lunev Ping
Re: [PATCH] iotests: don't run tests requiring cached writes in '-nocache' mode
On 12/11/23 15:32, Andrey Drobyshev wrote: > There're tests whose logic implies running without O_DIRECT set, > otherwise they fail when running iotests in '-nocache' mode. For these > tests let's add _require_no_o_direct() helper which can be put in the > preabmle and which makes sure '-nocache' isn't set. Use it to skip > running the following tests: > > * 271: creates files with unaligned sizes, thus producing multiple > errors like: > > qemu-io: can't open device /path/to/t.qcow2.raw: Cannot get 'write' > permission without 'resize': Image size is not a multiple of request alignment > > * 308, file-io-error: use fuse exports. Though fuse does have > 'direct-io' mode (see https://docs.kernel.org/filesystems/fuse-io.html) > we aren't using it yet, thus getting errors like: > > qemu-io: can't open device /path/to/t.qcow2.fuse: Could not open > '/path/to/t.qcow2.fuse': filesystem does not support O_DIRECT > > Signed-off-by: Andrey Drobyshev > --- > tests/qemu-iotests/271 | 1 + > tests/qemu-iotests/308 | 2 ++ > tests/qemu-iotests/common.rc | 7 +++ > tests/qemu-iotests/tests/file-io-error | 1 + > 4 files changed, 11 insertions(+) > > diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271 > index 59a6fafa2f..1424b6954d 100755 > --- a/tests/qemu-iotests/271 > +++ b/tests/qemu-iotests/271 > @@ -44,6 +44,7 @@ _supported_fmt qcow2 > _supported_proto file nfs > _supported_os Linux > _unsupported_imgopts extended_l2 compat=0.10 cluster_size data_file > refcount_bits=1[^0-9] > +_require_no_o_direct > > l2_offset=$((0x4)) > > diff --git a/tests/qemu-iotests/308 b/tests/qemu-iotests/308 > index de12b2b1b9..535455e5b1 100755 > --- a/tests/qemu-iotests/308 > +++ b/tests/qemu-iotests/308 > @@ -52,6 +52,8 @@ _unsupported_fmt vpc > _supported_proto file # We create the FUSE export manually > _supported_os Linux # We need /dev/urandom > > +_require_no_o_direct > + > # $1: Export ID > # $2: Options (beyond the node-name and ID) > # $3: Expected return value (defaults to 'return') > diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc > index 95c12577dd..f61eae73b4 100644 > --- a/tests/qemu-iotests/common.rc > +++ b/tests/qemu-iotests/common.rc > @@ -857,6 +857,13 @@ _check_o_direct() > [[ "$out" != *"O_DIRECT"* ]] > } > > +_require_no_o_direct() > +{ > +if [ $CACHEMODE == "none" ] || [ $CACHEMODE == "directsync" ]; then > +_notrun "not suitable for cache mode: $CACHEMODE (implies O_DIRECT)" > +fi > +} > + > _require_o_direct() > { > if ! _check_o_direct; then > diff --git a/tests/qemu-iotests/tests/file-io-error > b/tests/qemu-iotests/tests/file-io-error > index 88ee5f670c..2b8dc7f009 100755 > --- a/tests/qemu-iotests/tests/file-io-error > +++ b/tests/qemu-iotests/tests/file-io-error > @@ -40,6 +40,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 > # Format-agnostic (we do not use any), but we do test the file protocol > _supported_proto file > _require_drivers blkdebug null-co > +_require_no_o_direct > > if [ "$IMGOPTSSYNTAX" = "true" ]; then > # We need `$QEMU_IO -f file` to work; IMGOPTSSYNTAX uses --image-opts, Ping
[PATCH] iotests: don't run tests requiring cached writes in '-nocache' mode
There're tests whose logic implies running without O_DIRECT set, otherwise they fail when running iotests in '-nocache' mode. For these tests let's add _require_no_o_direct() helper which can be put in the preabmle and which makes sure '-nocache' isn't set. Use it to skip running the following tests: * 271: creates files with unaligned sizes, thus producing multiple errors like: qemu-io: can't open device /path/to/t.qcow2.raw: Cannot get 'write' permission without 'resize': Image size is not a multiple of request alignment * 308, file-io-error: use fuse exports. Though fuse does have 'direct-io' mode (see https://docs.kernel.org/filesystems/fuse-io.html) we aren't using it yet, thus getting errors like: qemu-io: can't open device /path/to/t.qcow2.fuse: Could not open '/path/to/t.qcow2.fuse': filesystem does not support O_DIRECT Signed-off-by: Andrey Drobyshev --- tests/qemu-iotests/271 | 1 + tests/qemu-iotests/308 | 2 ++ tests/qemu-iotests/common.rc | 7 +++ tests/qemu-iotests/tests/file-io-error | 1 + 4 files changed, 11 insertions(+) diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271 index 59a6fafa2f..1424b6954d 100755 --- a/tests/qemu-iotests/271 +++ b/tests/qemu-iotests/271 @@ -44,6 +44,7 @@ _supported_fmt qcow2 _supported_proto file nfs _supported_os Linux _unsupported_imgopts extended_l2 compat=0.10 cluster_size data_file refcount_bits=1[^0-9] +_require_no_o_direct l2_offset=$((0x4)) diff --git a/tests/qemu-iotests/308 b/tests/qemu-iotests/308 index de12b2b1b9..535455e5b1 100755 --- a/tests/qemu-iotests/308 +++ b/tests/qemu-iotests/308 @@ -52,6 +52,8 @@ _unsupported_fmt vpc _supported_proto file # We create the FUSE export manually _supported_os Linux # We need /dev/urandom +_require_no_o_direct + # $1: Export ID # $2: Options (beyond the node-name and ID) # $3: Expected return value (defaults to 'return') diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 95c12577dd..f61eae73b4 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -857,6 +857,13 @@ _check_o_direct() [[ "$out" != *"O_DIRECT"* ]] } +_require_no_o_direct() +{ +if [ $CACHEMODE == "none" ] || [ $CACHEMODE == "directsync" ]; then +_notrun "not suitable for cache mode: $CACHEMODE (implies O_DIRECT)" +fi +} + _require_o_direct() { if ! _check_o_direct; then diff --git a/tests/qemu-iotests/tests/file-io-error b/tests/qemu-iotests/tests/file-io-error index 88ee5f670c..2b8dc7f009 100755 --- a/tests/qemu-iotests/tests/file-io-error +++ b/tests/qemu-iotests/tests/file-io-error @@ -40,6 +40,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 # Format-agnostic (we do not use any), but we do test the file protocol _supported_proto file _require_drivers blkdebug null-co +_require_no_o_direct if [ "$IMGOPTSSYNTAX" = "true" ]; then # We need `$QEMU_IO -f file` to work; IMGOPTSSYNTAX uses --image-opts, -- 2.39.3
[PATCH] block: allocate aligned write buffer for 'truncate -m full'
In case we're truncating an image opened with O_DIRECT, we might get -EINVAL on write with unaligned buffer. In particular, when running iotests/298 with '-nocache' we get: qemu-io: Failed to resize underlying file: Could not write zeros for preallocation: Invalid argument Let's just allocate the buffer using qemu_blockalign0() instead. Signed-off-by: Andrey Drobyshev --- block/file-posix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index b862406c71..cee8de510b 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2354,7 +2354,7 @@ static int handle_aiocb_truncate(void *opaque) goto out; } -buf = g_malloc0(65536); +buf = qemu_blockalign0(aiocb->bs, 65536); seek_result = lseek(fd, current_length, SEEK_SET); if (seek_result < 0) { @@ -2413,7 +2413,7 @@ out: } } -g_free(buf); +qemu_vfree(buf); return result; } -- 2.39.3
Re: [PATCH] Revert "test/qga: use G_TEST_DIR to locate os-release test file"
On 12/4/23 19:09, Marc-André Lureau wrote: > Hi > > On Mon, Dec 4, 2023 at 9:01 PM Andrey Drobyshev > wrote: >> >> On 12/4/23 18:51, Marc-André Lureau wrote: >>> Hi >>> >>> On Mon, Dec 4, 2023 at 8:33 PM Andrey Drobyshev >>> wrote: >>>> >>>> Since the commit a85d09269b QGA_OS_RELEASE variable points to the path >>>> relative to the build dir. Then on qemu-ga startup this path can't be >>>> found as qemu-ga cwd is somewhere else, which leads to the test failure: >>>> >>>> # ./tests/unit/test-qga -p /qga/guest-get-osinfo >>>> # random seed: R02S3a90c22d77ff1070fbd844f4959cf4a4 >>>> # Start of qga tests >>>> ** >>>> ERROR:../tests/unit/test-qga.c:906:test_qga_guest_get_osinfo: 'str' >>>> should not be NULL >>>> Bail out! ERROR:../tests/unit/test-qga.c:906:test_qga_guest_get_osinfo: >>>> 'str' should not be NULL >>>> >>>> Let's obtain the absolute path again. >>> >>> Can you detail how the build and the test is done? >>> >> >> Simple as: >> >>> ./configure --cc=gcc --target-list=x86_64-softmmu --enable-guest-agent && >>> make -j16 >>> cd build; tests/unit/test-qga -p /qga/guest-get-osinfo >> >> >>> If I recall correctly, this change was done in order to move qga to a >>> subproject(), but isn't strictly required at this point. Although I >>> believe it is more correct to lookup test data relative to >>> G_TEST_DIST. >>> >> >> Then we'd have to change cwd of qemu-ga at startup to ensure relative >> paths work. Right now (with the initial change) it appears broken. > > By reverting the patch, it is _still_ broken if you run the test > manually from a different directory (say from tests/unit for example) > > With G_TEST_DIST, and proper testing environment, it works from any directory. > No, it seems to be failing as well, only earlier. Before the revert: > cd build/tests/unit; ./test-qga > # random seed: R02S450ef942c699b5af6dff48f9c5b73b33 > ** > ERROR:../tests/unit/test-qga.c:79:fixture_setup: assertion failed (error == > NULL): Failed to execute child process “$SRC/build/tests/unit/qga/qemu-ga” > (No such file or directory) (g-exec-error-quark, 8) > Bail out! ERROR:../tests/unit/test-qga.c:79:fixture_setup: assertion failed > (error == NULL): Failed to execute child process > “$SRC/build/tests/unit/qga/qemu-ga” (No such file or directory) > (g-exec-error-quark, 8) But maybe my testing environment isn't proper? > Tests are not meant to be run manually, you should run them through > the test runner: meson test -v test-qga > That's a good point, but I just found it suspicious that this is literally the *only* case of the *only* unit test which fails (when run directly from ./build). Could we fix the direct execution as well then? Btw test runner also cannot be run from just any directory, otherwise it complains: > meson test -v test-qga > > ERROR: No such build data file as > '$SRC/build/tests/unit/meson-private/build.dat'. >> >>>> >>>> This reverts commit a85d09269bb1a7071d3ce0f2957e3ca9dba7c047. >>>> >>>> Signed-off-by: Andrey Drobyshev >>>> --- >>>> tests/unit/test-qga.c | 6 -- >>>> 1 file changed, 4 insertions(+), 2 deletions(-) >>>> >>>> diff --git a/tests/unit/test-qga.c b/tests/unit/test-qga.c >>>> index 671e83cb86..47cf5e30ec 100644 >>>> --- a/tests/unit/test-qga.c >>>> +++ b/tests/unit/test-qga.c >>>> @@ -1034,10 +1034,12 @@ static void >>>> test_qga_guest_get_osinfo(gconstpointer data) >>>> g_autoptr(QDict) ret = NULL; >>>> char *env[2]; >>>> QDict *val; >>>> +g_autofree gchar *cwd = NULL; >>>> >>>> +cwd = g_get_current_dir(); >>>> env[0] = g_strdup_printf( >>>> -"QGA_OS_RELEASE=%s%c..%cdata%ctest-qga-os-release", >>>> -g_test_get_dir(G_TEST_DIST), G_DIR_SEPARATOR, G_DIR_SEPARATOR, >>>> G_DIR_SEPARATOR); >>>> +"QGA_OS_RELEASE=%s%ctests%cdata%ctest-qga-os-release", >>>> +cwd, G_DIR_SEPARATOR, G_DIR_SEPARATOR, G_DIR_SEPARATOR); >>>> env[1] = NULL; >>>> fixture_setup(, NULL, env); >>>> >>>> -- >>>> 2.39.3 >>>> >>>> >>> >>> >> > >
Re: [PATCH] Revert "test/qga: use G_TEST_DIR to locate os-release test file"
On 12/4/23 18:51, Marc-André Lureau wrote: > Hi > > On Mon, Dec 4, 2023 at 8:33 PM Andrey Drobyshev > wrote: >> >> Since the commit a85d09269b QGA_OS_RELEASE variable points to the path >> relative to the build dir. Then on qemu-ga startup this path can't be >> found as qemu-ga cwd is somewhere else, which leads to the test failure: >> >> # ./tests/unit/test-qga -p /qga/guest-get-osinfo >> # random seed: R02S3a90c22d77ff1070fbd844f4959cf4a4 >> # Start of qga tests >> ** >> ERROR:../tests/unit/test-qga.c:906:test_qga_guest_get_osinfo: 'str' should >> not be NULL >> Bail out! ERROR:../tests/unit/test-qga.c:906:test_qga_guest_get_osinfo: >> 'str' should not be NULL >> >> Let's obtain the absolute path again. > > Can you detail how the build and the test is done? > Simple as: > ./configure --cc=gcc --target-list=x86_64-softmmu --enable-guest-agent && > make -j16 > cd build; tests/unit/test-qga -p /qga/guest-get-osinfo > If I recall correctly, this change was done in order to move qga to a > subproject(), but isn't strictly required at this point. Although I > believe it is more correct to lookup test data relative to > G_TEST_DIST. > Then we'd have to change cwd of qemu-ga at startup to ensure relative paths work. Right now (with the initial change) it appears broken. >> >> This reverts commit a85d09269bb1a7071d3ce0f2957e3ca9dba7c047. >> >> Signed-off-by: Andrey Drobyshev >> --- >> tests/unit/test-qga.c | 6 -- >> 1 file changed, 4 insertions(+), 2 deletions(-) >> >> diff --git a/tests/unit/test-qga.c b/tests/unit/test-qga.c >> index 671e83cb86..47cf5e30ec 100644 >> --- a/tests/unit/test-qga.c >> +++ b/tests/unit/test-qga.c >> @@ -1034,10 +1034,12 @@ static void test_qga_guest_get_osinfo(gconstpointer >> data) >> g_autoptr(QDict) ret = NULL; >> char *env[2]; >> QDict *val; >> +g_autofree gchar *cwd = NULL; >> >> +cwd = g_get_current_dir(); >> env[0] = g_strdup_printf( >> -"QGA_OS_RELEASE=%s%c..%cdata%ctest-qga-os-release", >> -g_test_get_dir(G_TEST_DIST), G_DIR_SEPARATOR, G_DIR_SEPARATOR, >> G_DIR_SEPARATOR); >> +"QGA_OS_RELEASE=%s%ctests%cdata%ctest-qga-os-release", >> +cwd, G_DIR_SEPARATOR, G_DIR_SEPARATOR, G_DIR_SEPARATOR); >> env[1] = NULL; >> fixture_setup(, NULL, env); >> >> -- >> 2.39.3 >> >> > >
[PATCH] Revert "test/qga: use G_TEST_DIR to locate os-release test file"
Since the commit a85d09269b QGA_OS_RELEASE variable points to the path relative to the build dir. Then on qemu-ga startup this path can't be found as qemu-ga cwd is somewhere else, which leads to the test failure: # ./tests/unit/test-qga -p /qga/guest-get-osinfo # random seed: R02S3a90c22d77ff1070fbd844f4959cf4a4 # Start of qga tests ** ERROR:../tests/unit/test-qga.c:906:test_qga_guest_get_osinfo: 'str' should not be NULL Bail out! ERROR:../tests/unit/test-qga.c:906:test_qga_guest_get_osinfo: 'str' should not be NULL Let's obtain the absolute path again. This reverts commit a85d09269bb1a7071d3ce0f2957e3ca9dba7c047. Signed-off-by: Andrey Drobyshev --- tests/unit/test-qga.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/unit/test-qga.c b/tests/unit/test-qga.c index 671e83cb86..47cf5e30ec 100644 --- a/tests/unit/test-qga.c +++ b/tests/unit/test-qga.c @@ -1034,10 +1034,12 @@ static void test_qga_guest_get_osinfo(gconstpointer data) g_autoptr(QDict) ret = NULL; char *env[2]; QDict *val; +g_autofree gchar *cwd = NULL; +cwd = g_get_current_dir(); env[0] = g_strdup_printf( -"QGA_OS_RELEASE=%s%c..%cdata%ctest-qga-os-release", -g_test_get_dir(G_TEST_DIST), G_DIR_SEPARATOR, G_DIR_SEPARATOR, G_DIR_SEPARATOR); +"QGA_OS_RELEASE=%s%ctests%cdata%ctest-qga-os-release", +cwd, G_DIR_SEPARATOR, G_DIR_SEPARATOR, G_DIR_SEPARATOR); env[1] = NULL; fixture_setup(, NULL, env); -- 2.39.3
Re: [PATCH] kvm: emit GUEST_PANICKED event in case of abnormal KVM exit
On 11/1/23 18:13, Denis V. Lunev wrote: > On 11/1/23 16:23, Andrey Drobyshev wrote: >> Currently we emit GUEST_PANICKED event in case kvm_vcpu_ioctl() returns >> KVM_EXIT_SYSTEM_EVENT with the event type KVM_SYSTEM_EVENT_CRASH. Let's >> extend this scenario and emit GUEST_PANICKED in case of an abnormal KVM >> exit. That's a natural thing to do since in this case guest is no >> longer operational anyway. >> >> Signed-off-by: Andrey Drobyshev >> Acked-by: Denis V. Lunev >> --- >> accel/kvm/kvm-all.c | 19 +++ >> 1 file changed, 15 insertions(+), 4 deletions(-) >> >> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c >> index e39a810a4e..d74b3f0b0e 100644 >> --- a/accel/kvm/kvm-all.c >> +++ b/accel/kvm/kvm-all.c >> @@ -2816,6 +2816,14 @@ static void kvm_eat_signals(CPUState *cpu) >> } while (sigismember(, SIG_IPI)); >> } >> +static void kvm_emit_guest_crash(CPUState *cpu) >> +{ >> + kvm_cpu_synchronize_state(cpu); >> + qemu_mutex_lock_iothread(); >> + qemu_system_guest_panicked(cpu_get_crash_info(cpu)); >> + qemu_mutex_unlock_iothread(); >> +} >> + >> int kvm_cpu_exec(CPUState *cpu) >> { >> struct kvm_run *run = cpu->kvm_run; >> @@ -2969,21 +2977,24 @@ int kvm_cpu_exec(CPUState *cpu) >> ret = EXCP_INTERRUPT; >> break; >> case KVM_SYSTEM_EVENT_CRASH: >> - kvm_cpu_synchronize_state(cpu); >> - qemu_mutex_lock_iothread(); >> - qemu_system_guest_panicked(cpu_get_crash_info(cpu)); >> - qemu_mutex_unlock_iothread(); >> + kvm_emit_guest_crash(cpu); >> ret = 0; >> break; >> default: >> DPRINTF("kvm_arch_handle_exit\n"); >> ret = kvm_arch_handle_exit(cpu, run); >> + if (ret < 0) { >> + kvm_emit_guest_crash(cpu); >> + } >> break; >> } >> break; >> default: >> DPRINTF("kvm_arch_handle_exit\n"); >> ret = kvm_arch_handle_exit(cpu, run); >> + if (ret < 0) { >> + kvm_emit_guest_crash(cpu); >> + } >> break; >> } >> } while (ret == 0); > This allows to gracefully handle this problem in production > and reset the guest using on_crash action in libvirt. Ping
[PATCH] iotests: fix default MT detection
MT is being detected based on "-M help" output, and we're searching for the line ending with " (default)". However, in downstream one of the MTs marked as deprecated might become the default, in which case this logic breaks as the line would now end with " (default) (deprecated)". To fix potential issues here, let's relax that requirement and detect the mere presence of " (default)" line instead. Signed-off-by: Andrey Drobyshev --- tests/qemu-iotests/testenv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/testenv.py b/tests/qemu-iotests/testenv.py index e67ebd254b..3ff38f2661 100644 --- a/tests/qemu-iotests/testenv.py +++ b/tests/qemu-iotests/testenv.py @@ -40,7 +40,7 @@ def get_default_machine(qemu_prog: str) -> str: machines = outp.split('\n') try: -default_machine = next(m for m in machines if m.endswith(' (default)')) +default_machine = next(m for m in machines if ' (default)' in m) except StopIteration: return '' default_machine = default_machine.split(' ', 1)[0] -- 2.39.3
Re: [PATCH 4/7] qcow2: make subclusters discardable
On 10/31/23 18:33, Hanna Czenczek wrote: > (Sorry, opened another reply window, forgot I already had one open...) > > On 20.10.23 23:56, Andrey Drobyshev wrote: >> This commit makes the discard operation work on the subcluster level >> rather than cluster level. It introduces discard_l2_subclusters() >> function and makes use of it in qcow2 discard implementation, much like >> it's done with zero_in_l2_slice() / zero_l2_subclusters(). It also >> changes the qcow2 driver pdiscard_alignment to subcluster_size. That >> way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE) >> operation and free host disk space. >> >> This feature will let us gain additional disk space on guest >> TRIM/discard requests, especially when using large enough clusters >> (1M, 2M) with subclusters enabled. >> >> Signed-off-by: Andrey Drobyshev >> --- >> block/qcow2-cluster.c | 100 -- >> block/qcow2.c | 8 ++-- >> 2 files changed, 101 insertions(+), 7 deletions(-) >> >> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c >> index 7c6fa5524c..cf40f2dc12 100644 >> --- a/block/qcow2-cluster.c >> +++ b/block/qcow2-cluster.c >> @@ -2042,6 +2042,74 @@ discard_in_l2_slice(BlockDriverState *bs, >> uint64_t offset, uint64_t nb_clusters, >> return nb_clusters; >> } >> +static int coroutine_fn GRAPH_RDLOCK >> +discard_l2_subclusters(BlockDriverState *bs, uint64_t offset, >> + uint64_t nb_subclusters, >> + enum qcow2_discard_type type, >> + bool full_discard, >> + SubClusterRangeInfo *pscri) >> +{ >> + BDRVQcow2State *s = bs->opaque; >> + uint64_t new_l2_bitmap, l2_bitmap_mask; >> + int ret, sc = offset_to_sc_index(s, offset); >> + SubClusterRangeInfo scri = { 0 }; >> + >> + if (!pscri) { >> + ret = get_sc_range_info(bs, offset, nb_subclusters, ); >> + if (ret < 0) { >> + goto out; >> + } >> + } else { >> + scri = *pscri; > > Allowing to takes this from the caller sounds dangerous, considering we > need to track who takes care of freeing scri.l2_slice. > >> + } >> + >> + l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + >> nb_subclusters); >> + new_l2_bitmap = scri.l2_bitmap; >> + new_l2_bitmap &= ~l2_bitmap_mask; >> + >> + /* >> + * If there're no allocated subclusters left, we might as well >> discard >> + * the entire cluster. That way we'd also update the refcount >> table. >> + */ >> + if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) { > > What if there are subclusters in the cluster that are marked as zero, > outside of the discarded range? It sounds wrong to apply a discard with > either full_discard set or cleared to them. > Hmmm, then I think before falling to this path we should either: 1. Make sure full_discard=false. That is directly implied from what you said in your other mail: discarding with full_discard=false is equivalent to zeroizing; 2. Check that no subclusters within this cluster are explicitly marked as zeroes. 3. Check that either 1) or 2) is true (if ((1) || (2))). For now I like option 3) better. >> + return discard_in_l2_slice(bs, >> + QEMU_ALIGN_DOWN(offset, >> s->cluster_size), >> + 1, type, full_discard); >> + } >> + >> + /* >> + * Full discard means we fall through to the backing file, thus >> we only >> + * need to mark the subclusters as deallocated. > > I think it also means we need to clear the zero bits. > Yes, that seems right. > Hanna > > [...]
Re: [PATCH 5/7] qcow2: zero_l2_subclusters: fall through to discard operation when requested
On 11/3/23 17:19, Hanna Czenczek wrote: > On 20.10.23 23:56, Andrey Drobyshev wrote: >> When zeroizing subclusters within single cluster, detect usage of the >> BDRV_REQ_MAY_UNMAP flag and fall through to the subcluster-based discard >> operation, much like it's done with the cluster-based discards. That >> way subcluster-aligned operations "qemu-io -c 'write -z -u ...'" will >> lead to actual unmap. > > Ever since the introduction of discard-no-unref, I wonder whether that > is actually an advantage or not. I can’t think of a reason why it’d be > advantageous to drop the allocation. You mean omitting the UNallocation on the discard path? > > On the other hand, zero_in_l2_slice() does it indeed, so consistency is > a reasonable argument. > >> Signed-off-by: Andrey Drobyshev >> --- >> block/qcow2-cluster.c | 17 ++--- >> 1 file changed, 14 insertions(+), 3 deletions(-) >> >> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c >> index cf40f2dc12..040251f2c3 100644 >> --- a/block/qcow2-cluster.c >> +++ b/block/qcow2-cluster.c >> @@ -2242,7 +2242,7 @@ zero_l2_subclusters(BlockDriverState *bs, >> uint64_t offset, >> unsigned nb_subclusters, int flags) >> { >> BDRVQcow2State *s = bs->opaque; >> - uint64_t new_l2_bitmap; >> + uint64_t new_l2_bitmap, l2_bitmap_mask; >> int ret, sc = offset_to_sc_index(s, offset); >> SubClusterRangeInfo scri = { 0 }; >> @@ -2251,9 +2251,10 @@ zero_l2_subclusters(BlockDriverState *bs, >> uint64_t offset, >> goto out; >> } >> + l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + >> nb_subclusters); > > “l2_bitmap_mask” already wasn’t a great name in patch 4 because it > doesn’t say what mask it is, but in patch 4, there was at least only one > relevant mask. Here, we have two (ALLOC_RANGE and ZERO_RANGE), so the > name should reflect what kind of mask it is. > Noted. >> new_l2_bitmap = scri.l2_bitmap; >> - new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + >> nb_subclusters); >> - new_l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + >> nb_subclusters); >> + new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters); >> + new_l2_bitmap &= ~l2_bitmap_mask; >> /* >> * If there're no non-zero subclusters left, we might as well >> zeroize >> @@ -2266,6 +2267,16 @@ zero_l2_subclusters(BlockDriverState *bs, >> uint64_t offset, >> 1, flags); >> } >> + /* >> + * If the request allows discarding subclusters and they're actually >> + * allocated, we go down the discard path since after the discard >> + * operation the subclusters are going to be read as zeroes anyway. >> + */ >> + if ((flags & BDRV_REQ_MAY_UNMAP) && (scri.l2_bitmap & >> l2_bitmap_mask)) { >> + return discard_l2_subclusters(bs, offset, nb_subclusters, >> + QCOW2_DISCARD_REQUEST, false, >> ); >> + } > > I wonder why it matters whether any subclusters are allocated, i.e. why > can’t we just immediately use discard_l2_subclusters() whenever > BDRV_REQ_MAY_UNMAP is set, without even fetching SCRI (that would also > save us passing SCRI here, which I’ve already said on patch 4, I don’t > find great). > Yes, this way we'd indeed be able to get rid of passing an additional argument. > Doesn’t discard_l2_subclusters() guarantee the clusters read as zero > when full_discard=false? There is this case where it won’t mark the > subclusters as zero if there is no backing file, and none of the > subclusters is allocated. But still, (A) those subclusters will read as > zero, and (B) if there is a problem there, why don’t we just always mark > those subclusters as zero? This optimization only has effect when the > guest discards/zeroes subclusters (not whole clusters) that were already > discarded, so sounds miniscule. > > Hanna > Good question. I also can't think of any issues when zeroizing/discarding already unallocated clusters. Essentially you're saying that zeroizing subclusters is equivalent to discard_l2_subclusters(full_discard=false). Then in discard_l2_subclusters() implementation we may mark the subclusters as zeroes regardless of their allocation status in case of full_discard=false. But when dealing with the whole clusters this logic isn't quite applicable cause if the l2 entry isn't allocated at all there's no point marking it as zero. Correct? > [...]
Re: [PATCH 7/7] iotests/271: check disk usage on subcluster-based discard/unmap
On 11/3/23 17:51, Hanna Czenczek wrote: > On 20.10.23 23:56, Andrey Drobyshev wrote: >> Add _verify_du_delta() checker which is used to check that real disk >> usage delta meets the expectations. For now we use it for checking that >> subcluster-based discard/unmap operations lead to actual disk usage >> decrease (i.e. PUNCH_HOLE operation is performed). > > I’m not too happy about checking the disk usage because that relies on > the underlying filesystem actually accepting and executing the unmap. > Why is it not enough to check the L2 bitmap? > > …Coming back later (I had to fix the missing `ret = ` I mentioned in > patch 2, or this test would hang, so I couldn’t run it at first), I note > that checking the disk usage in fact doesn’t work on tmpfs. I usually > run the iotests in tmpfs, so that’s not great. > My original idea was to make sure that the PUNCH_HOLE operation did indeed take place, i.e. there was an actual discard. For instance, currently the discard operation initiated by qemu-io is called with the QCOW2_DISCARD_REQUEST discard type, but if some other type is passed by mistake, qcow2_queue_discard() won't be called, and though the subclusters will be marked unallocated in L2 the data will still be there. Not quite what we expect from discard operation. BTW checking the disk usage on tmpfs works on my machine: > # cd /tmp; df -Th /tmp > Filesystem Type Size Used Avail Use% Mounted on > tmpfs tmpfs 32G 2.5M 32G 1% /tmp > # BUILD=/root/src/qemu/master/build > # $BUILD/qemu-img create -f qcow2 -o extended_l2=on img.qcow2 1M > Formatting 'img.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=on > compression_type=zlib size=1048576 lazy_refcounts=off refcount_bits=16 > # $BUILD/qemu-io -c 'write -q 0 128k' img.qcow2 > # du --block-size=1 img.qcow2 > 397312 img.qcow2 > # $BUILD/qemu-io -f qcow2 -c 'discard -q 0 8k' img.qcow2 > # du --block-size=1 img.qcow2 > 389120 img.qcow2 > # $BUILD/qemu-io -f qcow2 -c 'discard -q 8k 120k' img.qcow2 > # du --block-size=1 img.qcow2 > 266240 img.qcow2 I'm wondering what this might depend on and can't we overcome this? >> Also add separate test case for discarding particular subcluster within >> one cluster. >> >> Signed-off-by: Andrey Drobyshev >> --- >> tests/qemu-iotests/271 | 25 - >> tests/qemu-iotests/271.out | 2 ++ >> 2 files changed, 26 insertions(+), 1 deletion(-) >> >> diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271 >> index c7c2cadda0..5fcb209f5f 100755 >> --- a/tests/qemu-iotests/271 >> +++ b/tests/qemu-iotests/271 >> @@ -81,6 +81,15 @@ _verify_l2_bitmap() >> fi >> } >> +# Check disk usage delta after a discard/unmap operation >> +# _verify_du_delta $before $after $expected_delta >> +_verify_du_delta() >> +{ >> + if [ $(($1 - $2)) -ne $3 ]; then >> + printf "ERROR: unexpected delta: $1 - $2 = $(($1 - $2)) != $3\n" >> + fi >> +} >> + >> # This should be called as _run_test c=XXX sc=XXX off=XXX len=XXX >> cmd=XXX >> # c: cluster number (0 if unset) >> # sc: subcluster number inside cluster @c (0 if unset) >> @@ -198,9 +207,12 @@ for use_backing_file in yes no; do >> alloc="$(seq 0 31)"; zero="" >> _run_test sc=0 len=64k >> - ### Zero and unmap half of cluster #0 (this won't unmap it) >> + ### Zero and unmap half of cluster #0 (this will unmap it) > > I think “it” refers to the cluster, and it is not unmapped. This test > case does not use a discard, but write -z instead, so it worked before. > (The L2 bitmap shown in the output doesn’t change, so functionally, this > patch series didn’t change this case.) > >From the _run_test() implementation: > # cmd: the command to pass to qemu-io, must be one of > > # write-> write > > # zero -> write -z > > # unmap-> write -z -u <- > > # compress -> write -c > > # discard -> discard > > _run_test() So it actually uses 'write -z -u', and we end up with an actual unmap. I agree that the l2 bitmap doesn't change, that's why I specifically added disk usage check to catch the changed functionality. >> alloc="$(seq 16 31)"; zero="$(seq 0 15)" >> + before=$(disk_usage "$TEST_I
Re: [PATCH 4/7] qcow2: make subclusters discardable
On 10/31/23 18:32, Hanna Czenczek wrote: > On 20.10.23 23:56, Andrey Drobyshev wrote: >> This commit makes the discard operation work on the subcluster level >> rather than cluster level. It introduces discard_l2_subclusters() >> function and makes use of it in qcow2 discard implementation, much like >> it's done with zero_in_l2_slice() / zero_l2_subclusters(). It also >> changes the qcow2 driver pdiscard_alignment to subcluster_size. That >> way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE) >> operation and free host disk space. >> >> This feature will let us gain additional disk space on guest >> TRIM/discard requests, especially when using large enough clusters >> (1M, 2M) with subclusters enabled. >> >> Signed-off-by: Andrey Drobyshev >> --- >> block/qcow2-cluster.c | 100 -- >> block/qcow2.c | 8 ++-- >> 2 files changed, 101 insertions(+), 7 deletions(-) >> >> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c >> index 7c6fa5524c..cf40f2dc12 100644 >> --- a/block/qcow2-cluster.c >> +++ b/block/qcow2-cluster.c > > [...] > >> + if (scri.l2_bitmap != new_l2_bitmap) { >> + set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap); >> + qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice); >> + } >> + >> + if (s->discard_passthrough[type]) { >> + qcow2_queue_discard(bs, (scri.l2_entry & L2E_OFFSET_MASK) + >> + offset_into_cluster(s, offset), >> + nb_subclusters * s->subcluster_size); > > Are we sure that the cluster is allocated, i.e. that scri.l2_entry & > L2E_OFFSET_MASK != 0? > I think it must be illegal to mark the entire cluster as unallocated and yet mark some of the subclusters as allocated. So in the case you describe we should detect it earlier in the '!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)' condition and fall back to discard_in_l2_slice(). > As a side note, I guess discard_in_l2_slice() should also use > qcow2_queue_discard() instead of bdrv_pdiscard() then. > Yes, looks like it. I'll make it a separate patch then. >> + } >> + >> + ret = 0; >> +out: >> + qcow2_cache_put(s->l2_table_cache, (void **) _slice); >> + >> + return ret; >> +} >> + >> int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, >> uint64_t bytes, enum qcow2_discard_type type, >> bool full_discard) >> @@ -2049,19 +2117,36 @@ int qcow2_cluster_discard(BlockDriverState >> *bs, uint64_t offset, >> BDRVQcow2State *s = bs->opaque; >> uint64_t end_offset = offset + bytes; >> uint64_t nb_clusters; >> + unsigned head, tail; >> int64_t cleared; >> int ret; >> /* Caller must pass aligned values, except at image end */ >> - assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); >> - assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || >> + assert(QEMU_IS_ALIGNED(offset, s->subcluster_size)); >> + assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) || >> end_offset == bs->total_sectors << BDRV_SECTOR_BITS); >> - nb_clusters = size_to_clusters(s, bytes); >> + head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset; >> + offset += head; >> + >> + tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 : >> + end_offset - MAX(offset, start_of_cluster(s, end_offset)); >> + end_offset -= tail; >> s->cache_discards = true; >> + if (head) { >> + ret = discard_l2_subclusters(bs, offset - head, >> + size_to_subclusters(s, head), type, >> + full_discard, NULL); >> + if (ret < 0) { >> + goto fail; >> + } >> + } >> + >> /* Each L2 slice is handled by its own loop iteration */ >> + nb_clusters = size_to_clusters(s, end_offset - offset); >> + >> while (nb_clusters > 0) { > > I think the comment should stay attached to the `while`. > Agreed. > Hanna > >> cleared = discard_in_l2_slice(bs, offset, nb_clusters, type, >> full_discard); >
Re: [PATCH 7/7] iotests/271: check disk usage on subcluster-based discard/unmap
On 11/3/23 17:59, Hanna Czenczek wrote: > On 03.11.23 16:51, Hanna Czenczek wrote: >> On 20.10.23 23:56, Andrey Drobyshev wrote: > > [...] > >>> @@ -528,6 +543,14 @@ for use_backing_file in yes no; do >>> else >>> _make_test_img -o extended_l2=on 1M >>> fi >>> + # Write cluster #0 and discard its subclusters #0-#3 >>> + $QEMU_IO -c 'write -q 0 64k' "$TEST_IMG" >>> + before=$(disk_usage "$TEST_IMG") >>> + $QEMU_IO -c 'discard -q 0 8k' "$TEST_IMG" >>> + after=$(disk_usage "$TEST_IMG") >>> + _verify_du_delta $before $after 8192 >>> + alloc="$(seq 4 31)"; zero="$(seq 0 3)" >>> + _verify_l2_bitmap 0 >>> # Write clusters #0-#2 and then discard them >>> $QEMU_IO -c 'write -q 0 128k' "$TEST_IMG" >>> $QEMU_IO -c 'discard -q 0 128k' "$TEST_IMG" >> >> Similarly to above, I think it would be good if we combined this >> following case with the one you added, i.e. to write 128k from the >> beginning, drop the write here, and change the discard to be “discard >> -q 8k 120k”, i.e. skip the subclusters we have already discarded, to >> see that this is still combined to discard the whole first cluster. >> >> ...Ah, see, and when I try this, the following assertion fails: >> >> qemu-io: ../block/qcow2-cache.c:156: qcow2_cache_destroy: Assertion >> `c->entries[i].ref == 0' failed. >> ./common.rc: line 220: 128894 Aborted (core dumped) ( >> VALGRIND_QEMU="${VALGRIND_QEMU_IO}" _qemu_proc_exec >> "${VALGRIND_LOGFILE}" "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@" ) >> >> Looks like an L2 table is leaked somewhere. That’s why SCRI should be >> a g_auto()-able type. > > Forgot to add: This single test case here is the only place where we > test the added functionality. I think there should be more cases. It > doesn’t really make sense now that 271 has so many cases for writing > zeroes, but so few for discarding, now that discarding works on > subclusters. Most of them should at least be considered whether we can > run them for discard as well. > > I didn’t want to push for such an extensive set of tests, but, well, now > it turned out I overlooked a bug in patch 4, and only found it because I > thought “this place might also make a nice test case for this series”. > > Hanna > I agree that more coverage should be added. Based on the previous email, I see the following cases: 1. Direct 'discard' on the subclusters range (discard_l2_subclusters()). 2. Direct 'discard' on the subclusters range, complementary to an unallocated range (i.e. discard_l2_subclusters() -> discard_in_l2_slice()). 3. 'write -u -z' on the subclusters range (zero_l2_subclusters() -> discard_l2_subclusters()). 4. 'write -u -z' on the subclusters range, complementary to an unallocated range (zero_l2_subclusters() -> discard_l2_subclusters() -> discard_in_l2_slice()). Would also be nice to test the zero_l2_subclusters() -> zero_in_l2_slice() path, but we'd have to somehow check the refcount table for that since the L2 bitmap doesn't change. Please let me know if you can think of anything else.
Re: [PATCH 6/7] iotests/common.rc: add disk_usage function
On 11/3/23 17:20, Hanna Czenczek wrote: > On 20.10.23 23:56, Andrey Drobyshev wrote: >> Move the definition from iotests/250 to common.rc. This is used to >> detect real disk usage of sparse files. In particular, we want to use >> it for checking subclusters-based discards. >> >> Signed-off-by: Andrey Drobyshev >> --- >> tests/qemu-iotests/250 | 5 - >> tests/qemu-iotests/common.rc | 6 ++ >> 2 files changed, 6 insertions(+), 5 deletions(-) >> >> diff --git a/tests/qemu-iotests/250 b/tests/qemu-iotests/250 >> index af48f83aba..c0a0dbc0ff 100755 >> --- a/tests/qemu-iotests/250 >> +++ b/tests/qemu-iotests/250 >> @@ -52,11 +52,6 @@ _unsupported_imgopts data_file >> # bdrv_co_truncate(bs->file) call in qcow2_co_truncate(), which >> might succeed >> # anyway. >> -disk_usage() >> -{ >> - du --block-size=1 $1 | awk '{print $1}' >> -} >> - >> size=2100M >> _make_test_img -o "cluster_size=1M,preallocation=metadata" $size >> diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc >> index 95c12577dd..5d2ea26c7f 100644 >> --- a/tests/qemu-iotests/common.rc >> +++ b/tests/qemu-iotests/common.rc >> @@ -140,6 +140,12 @@ _optstr_add() >> fi >> } >> +# report real disk usage for sparse files >> +disk_usage() >> +{ >> + du --block-size=1 $1 | awk '{print $1}' > > Pre-existing, but since you’re touching this now: Can you please change > the $1 to "$1"? > Sure, will do in v2. > Hanna > > [...]
Re: [PATCH 2/7] qcow2: add get_sc_range_info() helper for working with subcluster ranges
Hello Hanna, Sorry for the delay and thanks for your thorough and detailed review. On 10/31/23 17:53, Hanna Czenczek wrote: > On 20.10.23 23:56, Andrey Drobyshev wrote: >> This helper simply obtains the l2 table parameters of the cluster which >> contains the given subclusters range. Right now this info is being >> obtained and used by zero_l2_subclusters(). As we're about to introduce >> the subclusters discard operation, this helper would let us avoid code >> duplication. >> >> Also introduce struct SubClusterRangeInfo, which would contain all the >> needed params. >> >> Signed-off-by: Andrey Drobyshev >> --- >> block/qcow2-cluster.c | 90 +-- >> 1 file changed, 62 insertions(+), 28 deletions(-) >> >> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c >> index 904f00d1b3..8801856b93 100644 >> --- a/block/qcow2-cluster.c >> +++ b/block/qcow2-cluster.c >> @@ -32,6 +32,13 @@ >> #include "qemu/memalign.h" >> #include "trace.h" >> +typedef struct SubClusterRangeInfo { >> + uint64_t *l2_slice; > > We should document that this is a strong reference that must be returned > via qcow2_cache_put(). Maybe you could also define a clean-up function > using G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC() that does this, allowing this > type to be used with g_auto(). > >> + int l2_index; >> + uint64_t l2_entry; >> + uint64_t l2_bitmap; >> +} SubClusterRangeInfo; >> + >> int coroutine_fn qcow2_shrink_l1_table(BlockDriverState *bs, >> uint64_t exact_size) >> { >> @@ -1892,6 +1899,50 @@ again: >> return 0; >> } >> +static int get_sc_range_info(BlockDriverState *bs, uint64_t offset, >> + unsigned nb_subclusters, >> + SubClusterRangeInfo *scri) > > It would be good to have documentation for this function, for example > that it only works on a single cluster, i.e. that the range denoted by > @offset and @nb_subclusters must not cross a cluster boundary, and that > @offset must be aligned to subclusters. > I figured out those restrictions are derived from the number of asserts in the function body itself, much like it's done in zero_l2_subclusters() and friends. But of course I don't mind documenting this. > In general, it is unclear to me at this point what this function does. The sole purpose of this function is to avoid the code duplication, since both zero_l2_subclusters() (pre-existing) and discard_l2_subclusters() (newly introduced) need to obtain the same info about the subclusters range they're working with. > OK, it gets the SCRI for all subclusters in the cluster at @offset (this > is what its name implies), but then it also has this loop that checks > whether there are compressed clusters among the @nb_subclusters. Look at the pre-existing implementation of zero_l2_subclusters(): it also checks that the subcluster range we're dealing with isn't contained within a compressed cluster (otherwise there's no point zeroizing individual subclusters). So the logic isn't changed here. The only reason I decided to loop through the subclusters (instead of calling qcow2_get_cluster_type() for the whole cluster) is so that I could detect invalid subclusters and return -EINVAL right away. > It has a comment about being unable to zero/discard subclusters in compressed > clusters, but the function name says nothing about this scope of > zeroing/discarding. > Maybe rename it then to stress out that we're dealing with the regular subclusters only? get_normal_sc_range_info()? >> +{ >> + BDRVQcow2State *s = bs->opaque; >> + int ret, sc_cleared = 0, sc_index = offset_to_sc_index(s, offset); >> + QCow2SubclusterType sctype; >> + >> + /* Here we only work with the subclusters within single cluster. */ >> + assert(nb_subclusters > 0 && nb_subclusters < >> s->subclusters_per_cluster); >> + assert(sc_index + nb_subclusters <= s->subclusters_per_cluster); >> + assert(offset_into_subcluster(s, offset) == 0); >> + >> + ret = get_cluster_table(bs, offset, >l2_slice, >> >l2_index); >> + if (ret < 0) { >> + return ret; >> + } >> + >> + scri->l2_entry = get_l2_entry(s, scri->l2_slice, scri->l2_index); >> + scri->l2_bitmap = get_l2_bitmap(s, scri->l2_slice, scri->l2_index); >> + >> + do { >> + qcow2_get_subcluster_range_type(bs, scri->l2_entry, >> scri->l2_bitmap, >> +
Re: [PATCH v3 1/8] qemu-img: rebase: stop when reaching EOF of old backing file
On 11/1/23 11:50, Michael Tokarev wrote: > 19.09.2023 19:57, Andrey Drobyshev via wrote: >> In case when we're rebasing within one backing chain, and when target >> image >> is larger than old backing file, bdrv_is_allocated_above() ends up >> setting >> *pnum = 0. As a result, target offset isn't getting incremented, and we >> get stuck in an infinite for loop. Let's detect this case and proceed >> further down the loop body, as the offsets beyond the old backing size >> need >> to be explicitly zeroed. >> >> Signed-off-by: Andrey Drobyshev >> Reviewed-by: Denis V. Lunev >> Reviewed-by: Hanna Czenczek > > Is this change not for -stable anymore? First version of this patch has > been > Cc'd to stable, this one is not. > > Thanks, > > /mjt Hi Michael, Since this series is already merged in master, I'm not sure whether it's necessary to forward this particular patch to qemu-stable, or it should rather be cherry-picked to -stable by one of the block maintainers. Kevin, maybe you can clarify? Andrey
[PATCH] kvm: emit GUEST_PANICKED event in case of abnormal KVM exit
Currently we emit GUEST_PANICKED event in case kvm_vcpu_ioctl() returns KVM_EXIT_SYSTEM_EVENT with the event type KVM_SYSTEM_EVENT_CRASH. Let's extend this scenario and emit GUEST_PANICKED in case of an abnormal KVM exit. That's a natural thing to do since in this case guest is no longer operational anyway. Signed-off-by: Andrey Drobyshev Acked-by: Denis V. Lunev --- accel/kvm/kvm-all.c | 19 +++ 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index e39a810a4e..d74b3f0b0e 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2816,6 +2816,14 @@ static void kvm_eat_signals(CPUState *cpu) } while (sigismember(, SIG_IPI)); } +static void kvm_emit_guest_crash(CPUState *cpu) +{ +kvm_cpu_synchronize_state(cpu); +qemu_mutex_lock_iothread(); +qemu_system_guest_panicked(cpu_get_crash_info(cpu)); +qemu_mutex_unlock_iothread(); +} + int kvm_cpu_exec(CPUState *cpu) { struct kvm_run *run = cpu->kvm_run; @@ -2969,21 +2977,24 @@ int kvm_cpu_exec(CPUState *cpu) ret = EXCP_INTERRUPT; break; case KVM_SYSTEM_EVENT_CRASH: -kvm_cpu_synchronize_state(cpu); -qemu_mutex_lock_iothread(); -qemu_system_guest_panicked(cpu_get_crash_info(cpu)); -qemu_mutex_unlock_iothread(); +kvm_emit_guest_crash(cpu); ret = 0; break; default: DPRINTF("kvm_arch_handle_exit\n"); ret = kvm_arch_handle_exit(cpu, run); +if (ret < 0) { +kvm_emit_guest_crash(cpu); +} break; } break; default: DPRINTF("kvm_arch_handle_exit\n"); ret = kvm_arch_handle_exit(cpu, run); +if (ret < 0) { +kvm_emit_guest_crash(cpu); +} break; } } while (ret == 0); -- 2.39.3
Re: [PATCH 1/6] qemu-img: rebase: stop when reaching EOF of old backing file
On 10/26/23 09:32, Michael Tokarev wrote: > 01.06.2023 22:28, Andrey Drobyshev via: >> In case when we're rebasing within one backing chain, and when target >> image >> is larger than old backing file, bdrv_is_allocated_above() ends up >> setting >> *pnum = 0. As a result, target offset isn't getting incremented, and we >> get stuck in an infinite for loop. Let's detect this case and proceed >> further down the loop body, as the offsets beyond the old backing size >> need >> to be explicitly zeroed. > > Ping? Has this been forgotten? It's a few months already.. > > /mjt Hi Michael, It's not forgotten, there's already v3 of this series and it's already taken to the block branch by Kevin: https://lists.nongnu.org/archive/html/qemu-block/2023-09/msg00593.html Andrey
[PATCH 7/7] iotests/271: check disk usage on subcluster-based discard/unmap
Add _verify_du_delta() checker which is used to check that real disk usage delta meets the expectations. For now we use it for checking that subcluster-based discard/unmap operations lead to actual disk usage decrease (i.e. PUNCH_HOLE operation is performed). Also add separate test case for discarding particular subcluster within one cluster. Signed-off-by: Andrey Drobyshev --- tests/qemu-iotests/271 | 25 - tests/qemu-iotests/271.out | 2 ++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271 index c7c2cadda0..5fcb209f5f 100755 --- a/tests/qemu-iotests/271 +++ b/tests/qemu-iotests/271 @@ -81,6 +81,15 @@ _verify_l2_bitmap() fi } +# Check disk usage delta after a discard/unmap operation +# _verify_du_delta $before $after $expected_delta +_verify_du_delta() +{ +if [ $(($1 - $2)) -ne $3 ]; then +printf "ERROR: unexpected delta: $1 - $2 = $(($1 - $2)) != $3\n" +fi +} + # This should be called as _run_test c=XXX sc=XXX off=XXX len=XXX cmd=XXX # c: cluster number (0 if unset) # sc: subcluster number inside cluster @c (0 if unset) @@ -198,9 +207,12 @@ for use_backing_file in yes no; do alloc="$(seq 0 31)"; zero="" _run_test sc=0 len=64k -### Zero and unmap half of cluster #0 (this won't unmap it) +### Zero and unmap half of cluster #0 (this will unmap it) alloc="$(seq 16 31)"; zero="$(seq 0 15)" +before=$(disk_usage "$TEST_IMG") _run_test sc=0 len=32k cmd=unmap +after=$(disk_usage "$TEST_IMG") +_verify_du_delta $before $after 32768 ### Zero and unmap cluster #0 alloc=""; zero="$(seq 0 31)" @@ -447,7 +459,10 @@ for use_backing_file in yes no; do # Subcluster-aligned request from clusters #12 to #14 alloc="$(seq 0 15)"; zero="$(seq 16 31)" +before=$(disk_usage "$TEST_IMG") _run_test c=12 sc=16 len=128k cmd=unmap +after=$(disk_usage "$TEST_IMG") +_verify_du_delta $before $after $((128 * 1024)) alloc=""; zero="$(seq 0 31)" _verify_l2_bitmap 13 alloc="$(seq 16 31)"; zero="$(seq 0 15)" @@ -528,6 +543,14 @@ for use_backing_file in yes no; do else _make_test_img -o extended_l2=on 1M fi +# Write cluster #0 and discard its subclusters #0-#3 +$QEMU_IO -c 'write -q 0 64k' "$TEST_IMG" +before=$(disk_usage "$TEST_IMG") +$QEMU_IO -c 'discard -q 0 8k' "$TEST_IMG" +after=$(disk_usage "$TEST_IMG") +_verify_du_delta $before $after 8192 +alloc="$(seq 4 31)"; zero="$(seq 0 3)" +_verify_l2_bitmap 0 # Write clusters #0-#2 and then discard them $QEMU_IO -c 'write -q 0 128k' "$TEST_IMG" $QEMU_IO -c 'discard -q 0 128k' "$TEST_IMG" diff --git a/tests/qemu-iotests/271.out b/tests/qemu-iotests/271.out index 5be780de76..0da8d72cde 100644 --- a/tests/qemu-iotests/271.out +++ b/tests/qemu-iotests/271.out @@ -426,6 +426,7 @@ L2 entry #29: 0x ### Discarding clusters with non-zero bitmaps (backing file: yes) ### Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=raw +L2 entry #0: 0x8005 0000 L2 entry #0: 0x L2 entry #1: 0x Image resized. @@ -436,6 +437,7 @@ L2 entry #1: 0x ### Discarding clusters with non-zero bitmaps (backing file: no) ### Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +L2 entry #0: 0x8005 0000 L2 entry #0: 0x L2 entry #1: 0x Image resized. -- 2.39.3
[PATCH 3/7] qcow2: zeroize the entire cluster when there're no non-zero subclusters
When zeroizing the last non-zero subclusters within single cluster, it makes sense to go zeroize the entire cluster and go down zero_in_l2_slice() path right away. That way we'd also update the corresponding refcount table. Signed-off-by: Andrey Drobyshev --- block/qcow2-cluster.c | 18 +++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 8801856b93..7c6fa5524c 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -2145,7 +2145,7 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset, static int coroutine_fn GRAPH_RDLOCK zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, -unsigned nb_subclusters) +unsigned nb_subclusters, int flags) { BDRVQcow2State *s = bs->opaque; uint64_t new_l2_bitmap; @@ -2161,6 +2161,17 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters); new_l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters); +/* + * If there're no non-zero subclusters left, we might as well zeroize + * the entire cluster. That way we'd also update the refcount table. + */ +if ((new_l2_bitmap & QCOW_L2_BITMAP_ALL_ZEROES) == +QCOW_L2_BITMAP_ALL_ZEROES) { +qcow2_cache_put(s->l2_table_cache, (void **) _slice); +return zero_in_l2_slice(bs, QEMU_ALIGN_DOWN(offset, s->cluster_size), +1, flags); +} + if (new_l2_bitmap != scri.l2_bitmap) { set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap); qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice); @@ -2221,7 +2232,7 @@ int coroutine_fn qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset, if (head) { ret = zero_l2_subclusters(bs, offset - head, - size_to_subclusters(s, head)); + size_to_subclusters(s, head), flags); if (ret < 0) { goto fail; } @@ -2242,7 +2253,8 @@ int coroutine_fn qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset, } if (tail) { -ret = zero_l2_subclusters(bs, end_offset, size_to_subclusters(s, tail)); +ret = zero_l2_subclusters(bs, end_offset, + size_to_subclusters(s, tail), flags); if (ret < 0) { goto fail; } -- 2.39.3
[PATCH 2/7] qcow2: add get_sc_range_info() helper for working with subcluster ranges
This helper simply obtains the l2 table parameters of the cluster which contains the given subclusters range. Right now this info is being obtained and used by zero_l2_subclusters(). As we're about to introduce the subclusters discard operation, this helper would let us avoid code duplication. Also introduce struct SubClusterRangeInfo, which would contain all the needed params. Signed-off-by: Andrey Drobyshev --- block/qcow2-cluster.c | 90 +-- 1 file changed, 62 insertions(+), 28 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 904f00d1b3..8801856b93 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -32,6 +32,13 @@ #include "qemu/memalign.h" #include "trace.h" +typedef struct SubClusterRangeInfo { +uint64_t *l2_slice; +int l2_index; +uint64_t l2_entry; +uint64_t l2_bitmap; +} SubClusterRangeInfo; + int coroutine_fn qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size) { @@ -1892,6 +1899,50 @@ again: return 0; } +static int get_sc_range_info(BlockDriverState *bs, uint64_t offset, + unsigned nb_subclusters, + SubClusterRangeInfo *scri) +{ +BDRVQcow2State *s = bs->opaque; +int ret, sc_cleared = 0, sc_index = offset_to_sc_index(s, offset); +QCow2SubclusterType sctype; + +/* Here we only work with the subclusters within single cluster. */ +assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster); +assert(sc_index + nb_subclusters <= s->subclusters_per_cluster); +assert(offset_into_subcluster(s, offset) == 0); + +ret = get_cluster_table(bs, offset, >l2_slice, >l2_index); +if (ret < 0) { +return ret; +} + +scri->l2_entry = get_l2_entry(s, scri->l2_slice, scri->l2_index); +scri->l2_bitmap = get_l2_bitmap(s, scri->l2_slice, scri->l2_index); + +do { +qcow2_get_subcluster_range_type(bs, scri->l2_entry, scri->l2_bitmap, +sc_index, ); +if (ret < 0) { +return ret; +} + +switch (sctype) { +case QCOW2_SUBCLUSTER_COMPRESSED: +/* We cannot partially zeroize/discard compressed clusters. */ +return -ENOTSUP; +case QCOW2_SUBCLUSTER_INVALID: +return -EINVAL; +default: +break; +} + +sc_cleared += ret; +} while (sc_cleared < nb_subclusters); + +return 0; +} + /* * This discards as many clusters of nb_clusters as possible at once (i.e. * all clusters in the same L2 slice) and returns the number of discarded @@ -2097,44 +2148,27 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, unsigned nb_subclusters) { BDRVQcow2State *s = bs->opaque; -uint64_t *l2_slice; -uint64_t old_l2_bitmap, l2_bitmap; -int l2_index, ret, sc = offset_to_sc_index(s, offset); +uint64_t new_l2_bitmap; +int ret, sc = offset_to_sc_index(s, offset); +SubClusterRangeInfo scri = { 0 }; -/* For full clusters use zero_in_l2_slice() instead */ -assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster); -assert(sc + nb_subclusters <= s->subclusters_per_cluster); -assert(offset_into_subcluster(s, offset) == 0); - -ret = get_cluster_table(bs, offset, _slice, _index); +ret = get_sc_range_info(bs, offset, nb_subclusters, ); if (ret < 0) { -return ret; -} - -switch (qcow2_get_cluster_type(bs, get_l2_entry(s, l2_slice, l2_index))) { -case QCOW2_CLUSTER_COMPRESSED: -ret = -ENOTSUP; /* We cannot partially zeroize compressed clusters */ goto out; -case QCOW2_CLUSTER_NORMAL: -case QCOW2_CLUSTER_UNALLOCATED: -break; -default: -g_assert_not_reached(); } -old_l2_bitmap = l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index); - -l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters); -l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters); +new_l2_bitmap = scri.l2_bitmap; +new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters); +new_l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters); -if (old_l2_bitmap != l2_bitmap) { -set_l2_bitmap(s, l2_slice, l2_index, l2_bitmap); -qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); +if (new_l2_bitmap != scri.l2_bitmap) { +set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap); +qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice); } ret = 0; out: -qcow2_cache_put(s->l2_table_cache, (void **) _slice); +qcow2_cache_put(s->l2_table_cache, (void **) _slice); return ret; } -- 2.39.3
[PATCH 5/7] qcow2: zero_l2_subclusters: fall through to discard operation when requested
When zeroizing subclusters within single cluster, detect usage of the BDRV_REQ_MAY_UNMAP flag and fall through to the subcluster-based discard operation, much like it's done with the cluster-based discards. That way subcluster-aligned operations "qemu-io -c 'write -z -u ...'" will lead to actual unmap. Signed-off-by: Andrey Drobyshev --- block/qcow2-cluster.c | 17 ++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index cf40f2dc12..040251f2c3 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -2242,7 +2242,7 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, unsigned nb_subclusters, int flags) { BDRVQcow2State *s = bs->opaque; -uint64_t new_l2_bitmap; +uint64_t new_l2_bitmap, l2_bitmap_mask; int ret, sc = offset_to_sc_index(s, offset); SubClusterRangeInfo scri = { 0 }; @@ -2251,9 +2251,10 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, goto out; } +l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters); new_l2_bitmap = scri.l2_bitmap; -new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters); -new_l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters); +new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters); +new_l2_bitmap &= ~l2_bitmap_mask; /* * If there're no non-zero subclusters left, we might as well zeroize @@ -2266,6 +2267,16 @@ zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, 1, flags); } +/* + * If the request allows discarding subclusters and they're actually + * allocated, we go down the discard path since after the discard + * operation the subclusters are going to be read as zeroes anyway. + */ +if ((flags & BDRV_REQ_MAY_UNMAP) && (scri.l2_bitmap & l2_bitmap_mask)) { +return discard_l2_subclusters(bs, offset, nb_subclusters, + QCOW2_DISCARD_REQUEST, false, ); +} + if (new_l2_bitmap != scri.l2_bitmap) { set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap); qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice); -- 2.39.3
[PATCH 0/7] qcow2: make subclusters discardable
This series extends the discard operation for qcow2 driver so that it can be used to discard individial subclusters. This feature might prove useful and gain some host disk space when dealing with the guest TRIM/discard requests, especially with large clusters (1M, 2M) when subclusters are enabled. Andrey Drobyshev (7): qcow2: make function update_refcount_discard() global qcow2: add get_sc_range_info() helper for working with subcluster ranges qcow2: zeroize the entire cluster when there're no non-zero subclusters qcow2: make subclusters discardable qcow2: zero_l2_subclusters: fall through to discard operation when requested iotests/common.rc: add disk_usage function iotests/271: check disk usage on subcluster-based discard/unmap block/qcow2-cluster.c| 217 +-- block/qcow2-refcount.c | 8 +- block/qcow2.c| 8 +- block/qcow2.h| 2 + tests/qemu-iotests/250 | 5 - tests/qemu-iotests/271 | 25 +++- tests/qemu-iotests/271.out | 2 + tests/qemu-iotests/common.rc | 6 + 8 files changed, 226 insertions(+), 47 deletions(-) -- 2.39.3
[PATCH 4/7] qcow2: make subclusters discardable
This commit makes the discard operation work on the subcluster level rather than cluster level. It introduces discard_l2_subclusters() function and makes use of it in qcow2 discard implementation, much like it's done with zero_in_l2_slice() / zero_l2_subclusters(). It also changes the qcow2 driver pdiscard_alignment to subcluster_size. That way subcluster-aligned discards lead to actual fallocate(PUNCH_HOLE) operation and free host disk space. This feature will let us gain additional disk space on guest TRIM/discard requests, especially when using large enough clusters (1M, 2M) with subclusters enabled. Signed-off-by: Andrey Drobyshev --- block/qcow2-cluster.c | 100 -- block/qcow2.c | 8 ++-- 2 files changed, 101 insertions(+), 7 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 7c6fa5524c..cf40f2dc12 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -2042,6 +2042,74 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters, return nb_clusters; } +static int coroutine_fn GRAPH_RDLOCK +discard_l2_subclusters(BlockDriverState *bs, uint64_t offset, + uint64_t nb_subclusters, + enum qcow2_discard_type type, + bool full_discard, + SubClusterRangeInfo *pscri) +{ +BDRVQcow2State *s = bs->opaque; +uint64_t new_l2_bitmap, l2_bitmap_mask; +int ret, sc = offset_to_sc_index(s, offset); +SubClusterRangeInfo scri = { 0 }; + +if (!pscri) { +ret = get_sc_range_info(bs, offset, nb_subclusters, ); +if (ret < 0) { +goto out; +} +} else { +scri = *pscri; +} + +l2_bitmap_mask = QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters); +new_l2_bitmap = scri.l2_bitmap; +new_l2_bitmap &= ~l2_bitmap_mask; + +/* + * If there're no allocated subclusters left, we might as well discard + * the entire cluster. That way we'd also update the refcount table. + */ +if (!(new_l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC)) { +return discard_in_l2_slice(bs, + QEMU_ALIGN_DOWN(offset, s->cluster_size), + 1, type, full_discard); +} + +/* + * Full discard means we fall through to the backing file, thus we only + * need to mark the subclusters as deallocated. + * + * Non-full discard means subclusters should be explicitly marked as + * zeroes. In this case QCOW2 specification requires the corresponding + * allocation status bits to be unset as well. If the subclusters are + * deallocated in the first place and there's no backing, the operation + * can be skipped. + */ +if (!full_discard && +(bs->backing || scri.l2_bitmap & l2_bitmap_mask)) { +new_l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters); +} + +if (scri.l2_bitmap != new_l2_bitmap) { +set_l2_bitmap(s, scri.l2_slice, scri.l2_index, new_l2_bitmap); +qcow2_cache_entry_mark_dirty(s->l2_table_cache, scri.l2_slice); +} + +if (s->discard_passthrough[type]) { +qcow2_queue_discard(bs, (scri.l2_entry & L2E_OFFSET_MASK) + +offset_into_cluster(s, offset), +nb_subclusters * s->subcluster_size); +} + +ret = 0; +out: +qcow2_cache_put(s->l2_table_cache, (void **) _slice); + +return ret; +} + int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, uint64_t bytes, enum qcow2_discard_type type, bool full_discard) @@ -2049,19 +2117,36 @@ int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, BDRVQcow2State *s = bs->opaque; uint64_t end_offset = offset + bytes; uint64_t nb_clusters; +unsigned head, tail; int64_t cleared; int ret; /* Caller must pass aligned values, except at image end */ -assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); -assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || +assert(QEMU_IS_ALIGNED(offset, s->subcluster_size)); +assert(QEMU_IS_ALIGNED(end_offset, s->subcluster_size) || end_offset == bs->total_sectors << BDRV_SECTOR_BITS); -nb_clusters = size_to_clusters(s, bytes); +head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset; +offset += head; + +tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 : + end_offset - MAX(offset, start_of_cluster(s, end_offset)); +end_offset -= tail; s->cache_discards = true; +if (head) { +ret = discard_l2_subclusters(bs, offset - head, + size_to_subclusters(s, head), type, + full_discard, NULL); +i
[PATCH 1/7] qcow2: make function update_refcount_discard() global
We are going to need it for discarding separate subclusters. The function itself doesn't do anything with the refcount tables, it simply adds a discard request to the queue, so rename it to qcow2_queue_discard(). Signed-off-by: Andrey Drobyshev --- block/qcow2-refcount.c | 8 block/qcow2.h | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 0266542cee..2026f5fa21 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -754,8 +754,8 @@ void qcow2_process_discards(BlockDriverState *bs, int ret) } } -static void update_refcount_discard(BlockDriverState *bs, -uint64_t offset, uint64_t length) +void qcow2_queue_discard(BlockDriverState *bs, uint64_t offset, + uint64_t length) { BDRVQcow2State *s = bs->opaque; Qcow2DiscardRegion *d, *p, *next; @@ -902,7 +902,7 @@ update_refcount(BlockDriverState *bs, int64_t offset, int64_t length, } if (s->discard_passthrough[type]) { -update_refcount_discard(bs, cluster_offset, s->cluster_size); +qcow2_queue_discard(bs, cluster_offset, s->cluster_size); } } } @@ -3619,7 +3619,7 @@ qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs) /* discard refblock from the cache if refblock is cached */ qcow2_cache_discard(s->refcount_block_cache, refblock); } -update_refcount_discard(bs, discard_block_offs, s->cluster_size); +qcow2_queue_discard(bs, discard_block_offs, s->cluster_size); return 0; } diff --git a/block/qcow2.h b/block/qcow2.h index 29958c512b..75d6a1b61b 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -891,6 +891,8 @@ int coroutine_fn qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *re BdrvCheckMode fix); void qcow2_process_discards(BlockDriverState *bs, int ret); +void qcow2_queue_discard(BlockDriverState *bs, uint64_t offset, + uint64_t length); int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, int64_t size); -- 2.39.3
[PATCH 6/7] iotests/common.rc: add disk_usage function
Move the definition from iotests/250 to common.rc. This is used to detect real disk usage of sparse files. In particular, we want to use it for checking subclusters-based discards. Signed-off-by: Andrey Drobyshev --- tests/qemu-iotests/250 | 5 - tests/qemu-iotests/common.rc | 6 ++ 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/qemu-iotests/250 b/tests/qemu-iotests/250 index af48f83aba..c0a0dbc0ff 100755 --- a/tests/qemu-iotests/250 +++ b/tests/qemu-iotests/250 @@ -52,11 +52,6 @@ _unsupported_imgopts data_file # bdrv_co_truncate(bs->file) call in qcow2_co_truncate(), which might succeed # anyway. -disk_usage() -{ -du --block-size=1 $1 | awk '{print $1}' -} - size=2100M _make_test_img -o "cluster_size=1M,preallocation=metadata" $size diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 95c12577dd..5d2ea26c7f 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -140,6 +140,12 @@ _optstr_add() fi } +# report real disk usage for sparse files +disk_usage() +{ +du --block-size=1 $1 | awk '{print $1}' +} + # Set the variables to the empty string to turn Valgrind off # for specific processes, e.g. # $ VALGRIND_QEMU_IO= ./check -qcow2 -valgrind 015 -- 2.39.3
Re: [PATCH v3 0/8] qemu-img: rebase: add compression support
On 10/2/23 09:35, Andrey Drobyshev wrote: > On 9/19/23 20:57, Andrey Drobyshev wrote: >> v2 --> v3: >> * Patch 3/8: fixed logic in the if statement, so that we align on blk >>when blk_old_backing == NULL; >> * Patch 4/8: comment fix; >> * Patch 5/8: comment fix; dropped redundant "if (blk_new_backing)" >>statements. >> >> v2: https://lists.nongnu.org/archive/html/qemu-block/2023-09/msg00448.html >> >> Andrey Drobyshev (8): >> qemu-img: rebase: stop when reaching EOF of old backing file >> qemu-iotests: 024: add rebasing test case for overlay_size > >> backing_size >> qemu-img: rebase: use backing files' BlockBackend for buffer alignment >> qemu-img: add chunk size parameter to compare_buffers() >> qemu-img: rebase: avoid unnecessary COW operations >> iotests/{024, 271}: add testcases for qemu-img rebase >> qemu-img: add compression option to rebase subcommand >> iotests: add tests for "qemu-img rebase" with compression >> >> docs/tools/qemu-img.rst| 6 +- >> qemu-img-cmds.hx | 4 +- >> qemu-img.c | 136 ++ >> tests/qemu-iotests/024 | 117 ++ >> tests/qemu-iotests/024.out | 73 >> tests/qemu-iotests/271 | 131 + >> tests/qemu-iotests/271.out | 82 ++ >> tests/qemu-iotests/314 | 165 + >> tests/qemu-iotests/314.out | 75 + >> 9 files changed, 752 insertions(+), 37 deletions(-) >> create mode 100755 tests/qemu-iotests/314 >> create mode 100644 tests/qemu-iotests/314.out >> > > Ping Friendly ping
Re: [PATCH v3 0/8] qemu-img: rebase: add compression support
On 9/19/23 20:57, Andrey Drobyshev wrote: > v2 --> v3: > * Patch 3/8: fixed logic in the if statement, so that we align on blk >when blk_old_backing == NULL; > * Patch 4/8: comment fix; > * Patch 5/8: comment fix; dropped redundant "if (blk_new_backing)" >statements. > > v2: https://lists.nongnu.org/archive/html/qemu-block/2023-09/msg00448.html > > Andrey Drobyshev (8): > qemu-img: rebase: stop when reaching EOF of old backing file > qemu-iotests: 024: add rebasing test case for overlay_size > > backing_size > qemu-img: rebase: use backing files' BlockBackend for buffer alignment > qemu-img: add chunk size parameter to compare_buffers() > qemu-img: rebase: avoid unnecessary COW operations > iotests/{024, 271}: add testcases for qemu-img rebase > qemu-img: add compression option to rebase subcommand > iotests: add tests for "qemu-img rebase" with compression > > docs/tools/qemu-img.rst| 6 +- > qemu-img-cmds.hx | 4 +- > qemu-img.c | 136 ++ > tests/qemu-iotests/024 | 117 ++ > tests/qemu-iotests/024.out | 73 > tests/qemu-iotests/271 | 131 + > tests/qemu-iotests/271.out | 82 ++ > tests/qemu-iotests/314 | 165 + > tests/qemu-iotests/314.out | 75 + > 9 files changed, 752 insertions(+), 37 deletions(-) > create mode 100755 tests/qemu-iotests/314 > create mode 100644 tests/qemu-iotests/314.out > Ping
Re: [PATCH] maint: Tweak comment in mailmap to sound friendlier
On 9/26/23 23:39, ebl...@redhat.com wrote: > From: Eric Blake > > Documenting that we should not add new lines to work around SPF > rewrites sounds foreboding; the intent is instead that new lines here > are okay, but indicate a second problem elsewhere in our build process > that we should also consider fixing at the same time, to keep the > section from growing without bounds. > > Mentioning DMARC alongside SPF may also help people grep for this > scenario, as well as documenting the 'git config' workaround that can > be used by submitters to avoid the munging issue in the first place. > > Fixes: 3bd2608d ("maint: Add .mailmap entries for patches claiming list > authorship") > CC: Andrey Drobyshev > Cc: Peter Maydell > Signed-off-by: Eric Blake > --- > > I'm sending this email with a temporary 'git config sendemail.from > ebl...@redhat.com', to see if the added advice actually adds the extra > line. It did not show up in my editor window, though, so this patch > may need further tweaking to get the instructions right. Since I > don't normally suffer from SPF/DMARC munging, I may not be the best > person to test the right workaround. Or maybe 'git config' does not > yet have the right workaround already available as a turnkey solution > we can suggest. > The only drawback of this approach is that mail clients, as well as tools like patchew.org now only show your "" in From/Author field. You can see it here: https://patchew.org/search?q=project%3AQEMU+from%3ABlake In your email there're 2 "From:" fields now: > Headers... > From: ebl...@redhat.com > More headers... > From: Eric Blake > Actual patch Apparently, mail clients prefer to pay attention on the very first "From:" entry, while tools like "git am" -- on the last. If we managed to make those entries both be in the format "name " -- that'd be ideal. However, as I pointed out in another thread, if we set sendemail.from to "name ", the 2nd entry doesn't get added since they're now identical. So you figure out the way to get 2 identical "From:" entries -- please let us know. Having all that said, it would still be nice to have additional checks for "qemu-bl...@nongnu.org" authorship, as Peter mentioned in the previous thread. Andrey
[PATCH] mailmap: Fix Andrey Drobyshev author email
From: Andrey Drobyshev This fixes authorship of commits 2848289168, 52b10c9c0c as the mailing list rewrote the "From:" field in the corresponding patches. See commit 3bd2608db7 ("maint: Add .mailmap entries for patches claiming list authorship") for explanation. Signed-off-by: Andrey Drobyshev --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 64ef9f4de6..04a7feb005 100644 --- a/.mailmap +++ b/.mailmap @@ -46,6 +46,7 @@ Ian McKellar Ian McKellar via Qemu-devel Julia Suvorova via Qemu-devel Justin Terry (VM) Justin Terry (VM) via Qemu-devel Stefan Weil Stefan Weil via +Andrey Drobyshev Andrey Drobyshev via # Next, replace old addresses by a more recent one. Aleksandar Markovic -- 2.41.0
[PATCH v3 1/8] qemu-img: rebase: stop when reaching EOF of old backing file
In case when we're rebasing within one backing chain, and when target image is larger than old backing file, bdrv_is_allocated_above() ends up setting *pnum = 0. As a result, target offset isn't getting incremented, and we get stuck in an infinite for loop. Let's detect this case and proceed further down the loop body, as the offsets beyond the old backing size need to be explicitly zeroed. Signed-off-by: Andrey Drobyshev Reviewed-by: Denis V. Lunev Reviewed-by: Hanna Czenczek --- qemu-img.c | 13 - 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/qemu-img.c b/qemu-img.c index a48edb7101..50660ba920 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3805,6 +3805,8 @@ static int img_rebase(int argc, char **argv) } if (prefix_chain_bs) { +uint64_t bytes = n; + /* * If cluster wasn't changed since prefix_chain, we don't need * to take action @@ -3817,9 +3819,18 @@ static int img_rebase(int argc, char **argv) strerror(-ret)); goto out; } -if (!ret) { +if (!ret && n) { continue; } +if (!n) { +/* + * If we've reached EOF of the old backing, it means that + * offsets beyond the old backing size were read as zeroes. + * Now we will need to explicitly zero the cluster in + * order to preserve that state after the rebase. + */ +n = bytes; +} } /* -- 2.39.3
[PATCH v3 5/8] qemu-img: rebase: avoid unnecessary COW operations
When rebasing an image from one backing file to another, we need to compare data from old and new backings. If the diff between that data happens to be unaligned to the target cluster size, we might end up doing partial writes, which would lead to copy-on-write and additional IO. Consider the following simple case (virtual_size == cluster_size == 64K): base <-- inc1 <-- inc2 qemu-io -c "write -P 0xaa 0 32K" base.qcow2 qemu-io -c "write -P 0xcc 32K 32K" base.qcow2 qemu-io -c "write -P 0xbb 0 32K" inc1.qcow2 qemu-io -c "write -P 0xcc 32K 32K" inc1.qcow2 qemu-img rebase -f qcow2 -b base.qcow2 -F qcow2 inc2.qcow2 While doing rebase, we'll write a half of the cluster to inc2, and block layer will have to read the 2nd half of the same cluster from the base image inc1 while doing this write operation, although the whole cluster is already read earlier to perform data comparison. In order to avoid these unnecessary IO cycles, let's make sure every write request is aligned to the overlay subcluster boundaries. Using subcluster size is universal as for the images which don't have them this size equals to the cluster size. so in any case we end up aligning to the smallest unit of allocation. Signed-off-by: Andrey Drobyshev --- qemu-img.c | 74 +++--- 1 file changed, 54 insertions(+), 20 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 0f67b021f7..a2d6241648 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3523,6 +3523,7 @@ static int img_rebase(int argc, char **argv) uint8_t *buf_new = NULL; BlockDriverState *bs = NULL, *prefix_chain_bs = NULL; BlockDriverState *unfiltered_bs; +BlockDriverInfo bdi = {0}; char *filename; const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg; int c, flags, src_flags, ret; @@ -3533,6 +3534,7 @@ static int img_rebase(int argc, char **argv) bool quiet = false; Error *local_err = NULL; bool image_opts = false; +int64_t write_align; /* Parse commandline parameters */ fmt = NULL; @@ -3656,6 +3658,20 @@ static int img_rebase(int argc, char **argv) } } +/* + * We need overlay subcluster size to make sure write requests are + * aligned. + */ +ret = bdrv_get_info(unfiltered_bs, ); +if (ret < 0) { +error_report("could not get block driver info"); +goto out; +} else if (bdi.subcluster_size == 0) { +bdi.subcluster_size = 1; +} + +write_align = bdi.subcluster_size; + /* For safe rebasing we need to compare old and new backing file */ if (!unsafe) { QDict *options = NULL; @@ -3753,7 +3769,7 @@ static int img_rebase(int argc, char **argv) int64_t old_backing_size = 0; int64_t new_backing_size = 0; uint64_t offset; -int64_t n; +int64_t n, n_old = 0, n_new = 0; float local_progress = 0; if (blk_old_backing && bdrv_opt_mem_align(blk_bs(blk_old_backing)) > @@ -3799,7 +3815,8 @@ static int img_rebase(int argc, char **argv) } for (offset = 0; offset < size; offset += n) { -bool buf_old_is_zero = false; +bool old_backing_eof = false; +int64_t n_alloc; /* How many bytes can we handle with the next read? */ n = MIN(IO_BUF_SIZE, size - offset); @@ -3844,33 +3861,46 @@ static int img_rebase(int argc, char **argv) } } +/* + * At this point we know that the region [offset; offset + n) + * is unallocated within the target image. This region might be + * unaligned to the target image's (sub)cluster boundaries, as + * old backing may have smaller clusters (or have subclusters). + * We extend it to the aligned boundaries to avoid CoW on + * partial writes in blk_pwrite(), + */ +n += offset - QEMU_ALIGN_DOWN(offset, write_align); +offset = QEMU_ALIGN_DOWN(offset, write_align); +n += QEMU_ALIGN_UP(offset + n, write_align) - (offset + n); +n = MIN(n, size - offset); +assert(!bdrv_is_allocated(unfiltered_bs, offset, n, _alloc) && + n_alloc == n); + +/* + * Much like with the target image, we'll try to read as much + * of the old and new backings as we can. + */ +n_old = MIN(n, MAX(0, old_backing_size - (int64_t) offset)); +n_new = MIN(n, MAX(0, new_backing_size - (int64_t) offset)); + /* * Read old and new backing file and take into consideration that * backing files may be smaller than the COW image. */ -if (offset >= old_backing_size) { -memset(buf_old, 0, n); -buf_old_is_zero