08.07.2020 15:07, Stefan Hajnoczi wrote:
On Sat, Jun 20, 2020 at 05:36:47PM +0300, Vladimir Sementsov-Ogievskiy wrote: It may be used for file-systems with slow allocation. Signed-off-by: Vladimir Sementsov-Ogievskiy <vsement...@virtuozzo.com> --- qapi/block-core.json | 3 +- block/preallocate.c | 255 +++++++++++++++++++++++++++++++++++++++++++ block/Makefile.objs | 1 + 3 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 block/preallocate.c Please add documentation to docs/system/qemu-block-drivers.rst.inc describing the purpose of this block driver and how to use it.
This implies adding a new "Filters" section, yes?
Since this filter grows the file I guess it's intended to be below an image format?
Yes, between format and protocol nodes.
diff --git a/qapi/block-core.json b/qapi/block-core.json index 0e1c6a59f2..a0bda399d6 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2805,7 +2805,7 @@ 'cloop', 'compress', 'copy-on-read', 'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom', 'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', - 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', + 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' }, 'sheepdog', 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] } @@ -3995,6 +3995,7 @@ 'null-co': 'BlockdevOptionsNull', 'nvme': 'BlockdevOptionsNVMe', 'parallels': 'BlockdevOptionsGenericFormat', + 'preallocate':'BlockdevOptionsGenericFormat', 'qcow2': 'BlockdevOptionsQcow2', 'qcow': 'BlockdevOptionsQcow', 'qed': 'BlockdevOptionsGenericCOWFormat', diff --git a/block/preallocate.c b/block/preallocate.c new file mode 100644 index 0000000000..c272a6e41d --- /dev/null +++ b/block/preallocate.c @@ -0,0 +1,255 @@ +/* + * preallocate filter driver + * + * The driver performs preallocate operation: it is injected above + * some node, and before each write over EOF it does additional preallocating + * write-zeroes request. + * + * Copyright (c) 2020 Virtuozzo International GmbH. + * + * Author: + * Sementsov-Ogievskiy Vladimir <vsement...@virtuozzo.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" + +#include "qemu/module.h" +#include "qemu/units.h" +#include "block/block_int.h" + + +typedef struct BDRVPreallocateState { + int64_t prealloc_size; + int64_t prealloc_align; + + /* + * Track real data end, to crop preallocation on close data_end may be + * negative, which means that actual status is unknown (nothing cropped in + * this case) + */ + int64_t data_end; +} BDRVPreallocateState; + + +static int preallocate_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVPreallocateState *s = bs->opaque; + + /* + * Parameters are hardcoded now. May need to add corresponding options in + * future. + */The code for .bdrv_open() options is quick to write. If you add the options right away then it will be much easier for users who need to tweak them in the future.
OK
+ s->prealloc_align = 1 * MiB; + s->prealloc_size = 128 * MiB; + + bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds, + BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, + false, errp); + if (!bs->file) { + return -EINVAL; + } + + s->data_end = bdrv_getlength(bs->file->bs); + if (s->data_end < 0) { + return s->data_end; + } + + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | + (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); + + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | + ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & + bs->file->bs->supported_zero_flags); + + return 0; +} + +static void preallocate_close(BlockDriverState *bs) +{ + BDRVPreallocateState *s = bs->opaque; + + if (s->data_end >= 0 && bdrv_getlength(bs->file->bs) > s->data_end) { + bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0, NULL); + } +} + +static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c, + BdrvChildRole role, + BlockReopenQueue *reopen_queue, + uint64_t perm, uint64_t shared, + uint64_t *nperm, uint64_t *nshared) +{ + bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared); + + /* Force RESIZE permission, to be able to crop file on close() */ + *nperm |= BLK_PERM_RESIZE; +} + +static coroutine_fn int preallocate_co_preadv_part( + BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, int flags) +{ + return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags); +} + +static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs, + int64_t offset, int bytes) +{ + return bdrv_co_pdiscard(bs->file, offset, bytes); +} + +static bool coroutine_fn do_preallocate(BlockDriverState *bs, int64_t offset, + int64_t bytes, bool write_zero) +{ + BDRVPreallocateState *s = bs->opaque; + int64_t len, start, end; + BdrvTrackedRequest *lock; + int ret; + + if (s->data_end >= 0) { + s->data_end = MAX(s->data_end, + QEMU_ALIGN_UP(offset + bytes, 
BDRV_SECTOR_SIZE)); + } + + len = bdrv_getlength(bs->file->bs); + if (len < 0) { + return false; + } + + if (s->data_end < 0) { + s->data_end = MAX(len, + QEMU_ALIGN_UP(offset + bytes, BDRV_SECTOR_SIZE)); + } + + if (offset + bytes <= len) { + return false; + } + + lock = bdrv_co_range_try_lock(bs->file->bs, len, INT64_MAX - len); + if (!lock) { + /* There are already preallocating requests in-fligth */s/fligth/flight/+ return false; + } + + /* Length should not have changed */ + assert(len == bdrv_getlength(bs->file->bs)); + + start = write_zero ? MIN(offset, len) : len; + end = QEMU_ALIGN_UP(offset + bytes + s->prealloc_size, s->prealloc_align); + + ret = bdrv_co_pwrite_zeroes_locked(bs->file, start, end - start, + BDRV_REQ_NO_FALLBACK, lock); + + bdrv_co_range_unlock(lock);Hmm...if this piece of code is the only user of bdrv_co_range_try_lock() then a BDRV_REQ_NO_WAIT flag might be a simpler API. I thought the lock request would be used to perform multiple operations, but if it's just for a single operation then I think it's less code and easier to understand without the lock request.
Hmm, again, I don't remember the exact reasons. At first, I was afraid of the length changing during try_lock and had a double check with bdrv_getlength(). Then I decided that this is impossible and changed the check to an assertion. Probably, the only reason to keep the locked range was "I already have the code, it will help with copy-on-read, why not use it". OK, I'll try to rewrite it with the help of a new flag.
+ + return !ret; +} + +static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int bytes, BdrvRequestFlags flags) +{ + if (do_preallocate(bs, offset, bytes, true)) { + return 0; + } + + return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); +} + +static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs, + uint64_t offset, + uint64_t bytes, + QEMUIOVector *qiov, + size_t qiov_offset, + int flags) +{ + do_preallocate(bs, offset, bytes, false); + + return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, + flags); +} + +static int coroutine_fn +preallocate_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, + BdrvRequestFlags flags, Error **errp) +{ + BDRVPreallocateState *s = bs->opaque; + int ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); + + /* s->data_end may become negative here, which means unknown data end */ + s->data_end = bdrv_getlength(bs->file->bs); + + return ret; +} + +static int coroutine_fn preallocate_co_flush(BlockDriverState *bs) +{ + if (!bs->file) { + return 0; + }When does this happen? It's surprising to see the !bs->file check here but not in other functions.
It's just done like in mirror-top and backup-top. But it seems there should not be such an issue. Will drop.
+ + return bdrv_co_flush(bs->file->bs); +} + +static int64_t preallocate_getlength(BlockDriverState *bs) +{ + /* + * We probably can return s->data_end here, but seems safer to return real + * file length, not trying to hide the preallocation. + * + * Still, don't miss the chance to restore s->data_end if it is broken. + */ + BDRVPreallocateState *s = bs->opaque; + int64_t ret = bdrv_getlength(bs->file->bs); + + if (s->data_end < 0) { + s->data_end = ret; + } + + return ret; +} + +BlockDriver bdrv_preallocate_filter = { + .format_name = "preallocate", + .instance_size = sizeof(BDRVPreallocateState), + + .bdrv_getlength = preallocate_getlength, + .bdrv_open = preallocate_open, + .bdrv_close = preallocate_close, + + .bdrv_co_preadv_part = preallocate_co_preadv_part, + .bdrv_co_pwritev_part = preallocate_co_pwritev_part, + .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes, + .bdrv_co_pdiscard = preallocate_co_pdiscard, + .bdrv_co_flush = preallocate_co_flush, + .bdrv_co_truncate = preallocate_co_truncate, + + .bdrv_co_block_status = bdrv_co_block_status_from_file, + + .bdrv_child_perm = preallocate_child_perm, + + .has_variable_length = true, + .is_filter = true, +}; + +static void bdrv_preallocate_init(void) +{ + bdrv_register(&bdrv_preallocate_filter); +} + +block_init(bdrv_preallocate_init); diff --git a/block/Makefile.objs b/block/Makefile.objs index 3635b6b4c1..f46a353a35 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -45,6 +45,7 @@ block-obj-y += crypto.o block-obj-y += aio_task.o block-obj-y += backup-top.o block-obj-y += filter-compress.o +block-obj-y += preallocate.o common-obj-y += monitor/block-obj-y += stream.o-- 2.18.0
-- Best regards, Vladimir