Introduce an interface to make a "critical section" on top of the serialising requests feature. This is needed to avoid intersecting with other requests during a set of operations. It will be used in the next commit to implement the preallocate filter.
To keep assertions during the handling of intersecting requests, we need to add _locked versions of read/write requests, to be used inside a locked section. The following commit will need only the locked version of write-zeroes, so add only it. Signed-off-by: Vladimir Sementsov-Ogievskiy <vsement...@virtuozzo.com> --- include/block/block.h | 9 ++++ include/block/block_int.h | 16 ++++++++ include/qemu/typedefs.h | 1 + block/io.c | 86 +++++++++++++++++++++++++++++++-------- 4 files changed, 95 insertions(+), 17 deletions(-) diff --git a/include/block/block.h b/include/block/block.h index 25e299605e..c4b2a493b4 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -391,6 +391,15 @@ int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, */ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, int bytes, BdrvRequestFlags flags); + +/* + * Version of bdrv_co_pwrite_zeroes to be used inside bdrv_co_range_try_lock() + * .. bdrv_co_range_unlock() pair. + */ +int coroutine_fn bdrv_co_pwrite_zeroes_locked(BdrvChild *child, + int64_t offset, int bytes, BdrvRequestFlags flags, + BdrvTrackedRequest *lock); + BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, const char *backing_file); void bdrv_refresh_filename(BlockDriverState *bs); diff --git a/include/block/block_int.h b/include/block/block_int.h index 791de6a59c..9ec56f1825 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -66,6 +66,7 @@ enum BdrvTrackedRequestType { BDRV_TRACKED_WRITE, BDRV_TRACKED_DISCARD, BDRV_TRACKED_TRUNCATE, + BDRV_TRACKED_LOCK, }; typedef struct BdrvTrackedRequest { @@ -83,6 +84,12 @@ typedef struct BdrvTrackedRequest { CoQueue wait_queue; /* coroutines blocked on this request */ struct BdrvTrackedRequest *waiting_for; + + /* + * If non-zero, the request is under lock, so it's allowed to intersect + * (actually it must be inside) the @lock request. 
+ */ + struct BdrvTrackedRequest *lock; } BdrvTrackedRequest; struct BlockDriver { @@ -994,6 +1001,15 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags); +/* + * Creates serializing BDRV_TRACKED_LOCK request if no conflicts, on success + * return pointer to it, to be used in bdrv_co_range_unlock(). + */ +BdrvTrackedRequest *coroutine_fn bdrv_co_range_try_lock(BlockDriverState *bs, + int64_t offset, + int64_t bytes); +void coroutine_fn bdrv_co_range_unlock(BdrvTrackedRequest *req); + static inline int coroutine_fn bdrv_co_pread(BdrvChild *child, int64_t offset, unsigned int bytes, void *buf, BdrvRequestFlags flags) { diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index ecf3cde26c..e2bbe3af96 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -28,6 +28,7 @@ typedef struct Aml Aml; typedef struct AnnounceTimer AnnounceTimer; typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; typedef struct BdrvDirtyBitmapIter BdrvDirtyBitmapIter; +typedef struct BdrvTrackedRequest BdrvTrackedRequest; typedef struct BlockBackend BlockBackend; typedef struct BlockBackendRootState BlockBackendRootState; typedef struct BlockDriverState BlockDriverState; diff --git a/block/io.c b/block/io.c index 60cee1d759..0de72cdb4c 100644 --- a/block/io.c +++ b/block/io.c @@ -691,7 +691,8 @@ static void tracked_request_begin(BdrvTrackedRequest *req, BlockDriverState *bs, int64_t offset, uint64_t bytes, - enum BdrvTrackedRequestType type) + enum BdrvTrackedRequestType type, + BdrvTrackedRequest *lock) { assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes); @@ -704,6 +705,7 @@ static void tracked_request_begin(BdrvTrackedRequest *req, .serialising = false, .overlap_offset = offset, .overlap_bytes = bytes, + .lock = lock, }; qemu_co_queue_init(&req->wait_queue); @@ -745,15 +747,26 @@ static bool coroutine_fn wait_or_find_conflicts(BdrvTrackedRequest *self, if 
(tracked_request_overlaps(req, self->overlap_offset, self->overlap_bytes)) { - /* Hitting this means there was a reentrant request, for - * example, a block driver issuing nested requests. This must - * never happen since it means deadlock. + if (self->lock == req) { + /* This is atomic request under range_lock */ + assert(req->type == BDRV_TRACKED_LOCK); + assert(self->offset >= req->offset); + assert(self->bytes <= req->bytes); + continue; + } + /* + * Hitting this means there was a reentrant request (except for + * range_lock, handled above), for example, a block driver + * issuing nested requests. This must never happen since it + * means deadlock. */ assert(qemu_coroutine_self() != req->co); - /* If the request is already (indirectly) waiting for us, or + /* + * If the request is already (indirectly) waiting for us, or * will wait for us as soon as it wakes up, then just go on - * (instead of producing a deadlock in the former case). */ + * (instead of producing a deadlock in the former case). 
+ */ if (!req->waiting_for) { if (!wait) { return true; @@ -821,7 +834,6 @@ bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) } /* Return true on success, false if there are some conflicts */ -__attribute__ ((unused)) static bool bdrv_try_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) { @@ -1796,7 +1808,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child, bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad); - tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); + tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ, NULL); ret = bdrv_aligned_preadv(child, &req, offset, bytes, bs->bl.request_alignment, qiov, qiov_offset, flags); @@ -2169,9 +2181,9 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child, return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags); } -int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, +static int coroutine_fn bdrv_co_pwritev_part_locked(BdrvChild *child, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset, - BdrvRequestFlags flags) + BdrvRequestFlags flags, BdrvTrackedRequest *lock) { BlockDriverState *bs = child->bs; BdrvTrackedRequest req; @@ -2215,7 +2227,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, * Pad qiov with the read parts and be sure to have a tracked request not * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
*/ - tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); + tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE, lock); if (flags & BDRV_REQ_ZERO_WRITE) { ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req); @@ -2239,8 +2251,23 @@ out: return ret; } +int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset, + BdrvRequestFlags flags) +{ + return bdrv_co_pwritev_part_locked(child, offset, bytes, qiov, qiov_offset, + flags, NULL); +} + int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, int bytes, BdrvRequestFlags flags) +{ + return bdrv_co_pwrite_zeroes_locked(child, offset, bytes, flags, NULL); +} + +int coroutine_fn bdrv_co_pwrite_zeroes_locked(BdrvChild *child, int64_t offset, + int bytes, BdrvRequestFlags flags, + BdrvTrackedRequest *lock) { trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); @@ -2248,8 +2275,8 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, flags &= ~BDRV_REQ_MAY_UNMAP; } - return bdrv_co_pwritev(child, offset, bytes, NULL, - BDRV_REQ_ZERO_WRITE | flags); + return bdrv_co_pwritev_part_locked(child, offset, bytes, NULL, 0, + BDRV_REQ_ZERO_WRITE | flags, lock); } /* @@ -2980,7 +3007,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, tail = (offset + bytes) % align; bdrv_inc_in_flight(bs); - tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); + tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD, NULL); ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); if (ret < 0) { @@ -3252,7 +3279,7 @@ static int coroutine_fn bdrv_co_copy_range_internal( if (recurse_src) { bdrv_inc_in_flight(src->bs); tracked_request_begin(&req, src->bs, src_offset, bytes, - BDRV_TRACKED_READ); + BDRV_TRACKED_READ, NULL); /* BDRV_REQ_SERIALISING is only for write operation */ assert(!(read_flags & BDRV_REQ_SERIALISING)); @@ -3269,7 
+3296,7 @@ static int coroutine_fn bdrv_co_copy_range_internal( } else { bdrv_inc_in_flight(dst->bs); tracked_request_begin(&req, dst->bs, dst_offset, bytes, - BDRV_TRACKED_WRITE); + BDRV_TRACKED_WRITE, NULL); ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req, write_flags); if (!ret) { @@ -3381,7 +3408,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, bdrv_inc_in_flight(bs); tracked_request_begin(&req, bs, offset - new_bytes, new_bytes, - BDRV_TRACKED_TRUNCATE); + BDRV_TRACKED_TRUNCATE, NULL); /* If we are growing the image and potentially using preallocation for the * new area, we need to make sure that no write requests are made to it @@ -3494,3 +3521,28 @@ int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, return bdrv_run_co(child->bs, bdrv_truncate_co_entry, &tco); } + +BdrvTrackedRequest *coroutine_fn bdrv_co_range_try_lock(BlockDriverState *bs, + int64_t offset, + int64_t bytes) +{ + BdrvTrackedRequest *req = g_new(BdrvTrackedRequest, 1); + bool success; + + tracked_request_begin(req, bs, offset, bytes, BDRV_TRACKED_LOCK, NULL); + success = bdrv_try_mark_request_serialising(req, bdrv_get_cluster_size(bs)); + if (!success) { + g_free(req); + req = NULL; + } + + return req; +} + +void coroutine_fn bdrv_co_range_unlock(BdrvTrackedRequest *req) +{ + assert(req->type == BDRV_TRACKED_LOCK); + + tracked_request_end(req); + g_free(req); +} -- 2.18.0