This patch is part of the Fast Virtual Disk (FVD) proposal. See http://wiki.qemu.org/Features/FVD.
This patch adds the support for aio_cancel into FVD. FVD faithfully cleans up all resources upon aio_cancel. Signed-off-by: Chunqiang Tang <ct...@us.ibm.com> --- block/fvd-journal-buf.c | 16 +++++++++++ block/fvd-load.c | 24 +++++++++++++++++ block/fvd-misc.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++ block/fvd-read.c | 37 ++++++++++++++++++++++++++ block/fvd-store.c | 31 +++++++++++++++++++++ block/fvd-write.c | 23 +++++++++++++++- block/fvd.c | 25 +++++++++++++++++ 7 files changed, 222 insertions(+), 1 deletions(-) diff --git a/block/fvd-journal-buf.c b/block/fvd-journal-buf.c index e99a585..c6b60f9 100644 --- a/block/fvd-journal-buf.c +++ b/block/fvd-journal-buf.c @@ -360,6 +360,22 @@ use_current_buf: return s->bjnl.buf; } +static void fvd_aio_cancel_bjnl_flush(FvdAIOCB * acb) /* Cancel handler for OP_BJNL_FLUSH: unlinks acb from s->bjnl.queued_bufs and releases it. */ +{ + BlockDriverState *bs = acb->common.bs; + BDRVFvdState *s = bs->opaque; + QTAILQ_REMOVE(&s->bjnl.queued_bufs, acb, jcb.bjnl_next_queued_buf); + my_qemu_aio_release(acb); +} + +static void fvd_aio_cancel_bjnl_buf_write(FvdAIOCB * acb) +{ + /* OP_BJNL_BUF_WRITE is never exposed to any external entity, and this + * should not be invoked. Internal cancellation of OP_BJNL_BUF_WRITE + * is handled by bjnl_sync_flush().
*/ + abort(); +} + static void bjnl_clean_buf_timer_cb(BlockDriverState * bs) { BDRVFvdState *s = bs->opaque; diff --git a/block/fvd-load.c b/block/fvd-load.c index 88e5fb4..9789cc5 100644 --- a/block/fvd-load.c +++ b/block/fvd-load.c @@ -188,6 +188,30 @@ static inline FvdAIOCB *init_load_acb(FvdAIOCB * parent_acb, return acb; } +static void fvd_aio_cancel_wrapper(FvdAIOCB * acb) /* Cancel handler for OP_WRAPPER: cancels and deletes acb->wrapper.bh, then releases acb. */ +{ + qemu_bh_cancel(acb->wrapper.bh); + qemu_bh_delete(acb->wrapper.bh); + my_qemu_aio_release(acb); +} + /* Cancel handler for OP_LOAD_COMPACT: cancels every child request whose hd_acb is set, frees the children array, cancels one_child if its hd_acb is set, then releases acb. */ +static void fvd_aio_cancel_load_compact(FvdAIOCB * acb) +{ + if (acb->load.children) { + int i; + for (i = 0; i < acb->load.num_children; i++) { + if (acb->load.children[i].hd_acb) { + bdrv_aio_cancel(acb->load.children[i].hd_acb); + } + } + my_qemu_free(acb->load.children); + } + if (acb->load.one_child.hd_acb) { + bdrv_aio_cancel(acb->load.one_child.hd_acb); + } + my_qemu_aio_release(acb); +} + static inline int load_create_one_child(bool count_only, bool empty, QEMUIOVector * orig_qiov, int *iov_index, size_t *iov_left, uint8_t **iov_buf, int64_t start_sec, int sectors_in_region, diff --git a/block/fvd-misc.c b/block/fvd-misc.c index f4e1038..a42bfac 100644 --- a/block/fvd-misc.c +++ b/block/fvd-misc.c @@ -11,6 +11,73 @@ * */ +static void fvd_aio_cancel_bjnl_buf_write(FvdAIOCB * acb); /* First of the forward declarations for the per-operation cancel handlers that fvd_aio_cancel() dispatches to. */ +static void fvd_aio_cancel_bjnl_flush(FvdAIOCB * acb); +static void fvd_aio_cancel_read(FvdAIOCB * acb); +static void fvd_aio_cancel_write(FvdAIOCB * acb); +static void fvd_aio_cancel_copy(FvdAIOCB * acb); +static void fvd_aio_cancel_load_compact(FvdAIOCB * acb); +static void fvd_aio_cancel_store_compact(FvdAIOCB * acb); +static void fvd_aio_cancel_wrapper(FvdAIOCB * acb); +static void flush_metadata_to_disk_on_exit (BlockDriverState *bs); + /* Cancel handler for OP_FLUSH: cancels data_acb and metadata_acb when set, then releases acb. */ +static void fvd_aio_cancel_flush(FvdAIOCB * acb) +{ + if (acb->flush.data_acb) { + bdrv_aio_cancel(acb->flush.data_acb); + } + if (acb->flush.metadata_acb) { + bdrv_aio_cancel(acb->flush.metadata_acb); + } + my_qemu_aio_release(acb); +} + /* Entry point for cancelling an FVD request: recovers the FvdAIOCB from the generic AIOCB, sets cancel_in_progress, and dispatches on acb->type to the matching handler. */ +static void
fvd_aio_cancel(BlockDriverAIOCB * blockacb) +{ + FvdAIOCB *acb = container_of(blockacb, FvdAIOCB, common); + + QDEBUG("CANCEL: acb%llu-%p\n", acb->uuid, acb); + acb->cancel_in_progress = true; + /* NOTE(review): there is no default case, so an acb->type not listed here is ignored without any cleanup. */ + switch (acb->type) { + case OP_READ: + fvd_aio_cancel_read(acb); + break; + + case OP_WRITE: + fvd_aio_cancel_write(acb); + break; + + case OP_COPY: + fvd_aio_cancel_copy(acb); + break; + + case OP_LOAD_COMPACT: + fvd_aio_cancel_load_compact(acb); + break; + + case OP_STORE_COMPACT: + fvd_aio_cancel_store_compact(acb); + break; + + case OP_WRAPPER: + fvd_aio_cancel_wrapper(acb); + break; + + case OP_FLUSH: + fvd_aio_cancel_flush(acb); + break; + + case OP_BJNL_BUF_WRITE: + fvd_aio_cancel_bjnl_buf_write(acb); + break; + + case OP_BJNL_FLUSH: + fvd_aio_cancel_bjnl_flush(acb); + break; + } +} + static void fvd_close(BlockDriverState * bs) { } diff --git a/block/fvd-read.c b/block/fvd-read.c index 675af9e..b18fdf2 100644 --- a/block/fvd-read.c +++ b/block/fvd-read.c @@ -502,3 +502,40 @@ static inline void calc_read_region(BDRVFvdState * s, int64_t sector_num, *p_first_sec_in_backing = first_sec_in_backing; *p_last_sec_in_backing = last_sec_in_backing; } + /* Cancel handler for OP_READ: cancels the read_backing and read_fvd requests whose hd_acb is set, frees their iov buffers when allocated, then releases acb. */ +static void fvd_aio_cancel_read(FvdAIOCB * acb) +{ + if (acb->read.read_backing.hd_acb) { + bdrv_aio_cancel(acb->read.read_backing.hd_acb); + } + if (acb->read.read_fvd.hd_acb) { + bdrv_aio_cancel(acb->read.read_fvd.hd_acb); + } + if (acb->read.read_backing.iov.iov_base) { + my_qemu_vfree(acb->read.read_backing.iov.iov_base); + } + if (acb->read.read_fvd.iov.iov_base) { + my_qemu_vfree(acb->read.read_fvd.iov.iov_base); + } + my_qemu_aio_release(acb); +} + /* Cancel handler for OP_COPY: cancels copy.hd_acb when set, unlinks acb from the copy_lock list and restarts dependent writes if it was linked, frees copy.buf, adjusts the copy-on-read accounting, then releases acb. */ +static void fvd_aio_cancel_copy(FvdAIOCB * acb) +{ + BlockDriverState *bs = acb->common.bs; + BDRVFvdState *s = bs->opaque; + + if (acb->copy.hd_acb) { + bdrv_aio_cancel(acb->copy.hd_acb); + } + if (acb->copy_lock.next.le_prev != NULL) { + QLIST_REMOVE(acb, copy_lock.next); + restart_dependent_writes(acb); + } + my_qemu_vfree(acb->copy.buf); + if (acb->common.cb !=
prefetch_null_cb) { + /* This is a copy-on-read operation. Subtract its size from the outstanding total; 512 is presumably the sector size in bytes (TODO confirm). */ + s->outstanding_copy_on_read_data -= acb->nb_sectors * 512; + } + my_qemu_aio_release(acb); +} diff --git a/block/fvd-store.c b/block/fvd-store.c index fe670eb..ec23fd7 100644 --- a/block/fvd-store.c +++ b/block/fvd-store.c @@ -477,3 +477,34 @@ static inline FvdAIOCB *init_store_acb(int soft_write, COPY_UUID(acb, parent_acb); return acb; } + /* Cancel handler for OP_STORE_COMPACT: cancels child requests and one_child when their hd_acb is set, frees the children array, cancels jcb.hd_acb (calling ujnl_free_journal_sectors() when use_bjnl is false), frees jcb.iov.iov_base, unlinks acb from the ujnl_next_wait4_recycle list if linked, then releases acb. */ +static void fvd_aio_cancel_store_compact(FvdAIOCB * acb) +{ + if (acb->store.children) { + int i; + for (i = 0; i < acb->store.num_children; i++) { + if (acb->store.children[i].hd_acb) { + bdrv_aio_cancel(acb->store.children[i].hd_acb); + } + } + my_qemu_free(acb->store.children); + } + if (acb->store.one_child.hd_acb) { + bdrv_aio_cancel(acb->store.one_child.hd_acb); + } + if (acb->jcb.hd_acb) { + bdrv_aio_cancel(acb->jcb.hd_acb); + BDRVFvdState *s = acb->common.bs->opaque; + if (!s->use_bjnl) { + ujnl_free_journal_sectors(acb->common.bs); + } + } + if (acb->jcb.iov.iov_base != NULL) { + my_qemu_vfree(acb->jcb.iov.iov_base); + } + if (acb->jcb.ujnl_next_wait4_recycle.le_prev) { + QLIST_REMOVE(acb, jcb.ujnl_next_wait4_recycle); + } + + my_qemu_aio_release(acb); +} diff --git a/block/fvd-write.c b/block/fvd-write.c index 623ec83..a74dc5d 100644 --- a/block/fvd-write.c +++ b/block/fvd-write.c @@ -15,7 +15,7 @@ static void write_metadata_to_journal(struct FvdAIOCB *acb, bool update_bitmap); static int do_aio_write(struct FvdAIOCB *acb); static void restart_dependent_writes(struct FvdAIOCB *acb); static void free_write_resource(struct FvdAIOCB *acb); -static void ujnl_free_journal_sectors(BlockDriverState * bs); +static void ujnl_free_journal_sectors(BlockDriverState *bs); static inline BlockDriverAIOCB *store_data(int soft_write, FvdAIOCB * parent_acb, BlockDriverState * bs, int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors, @@ -106,6 +106,27 @@ slow_path: return &acb->common; } +static void fvd_aio_cancel_write(FvdAIOCB * acb) /* Cancel handler for OP_WRITE: cancels write.hd_acb and jcb.hd_acb when set (calling ujnl_free_journal_sectors() when use_bjnl is false), unlinks acb from the lists it is still on, then hands it to free_write_resource(). */ +{ + if (acb->write.hd_acb) {
+ bdrv_aio_cancel(acb->write.hd_acb); + } + if (acb->jcb.hd_acb) { + bdrv_aio_cancel(acb->jcb.hd_acb); + BDRVFvdState *s = acb->common.bs->opaque; + if (!s->use_bjnl) { + ujnl_free_journal_sectors(acb->common.bs); + } + } + if (acb->jcb.ujnl_next_wait4_recycle.le_prev) { + QLIST_REMOVE(acb, jcb.ujnl_next_wait4_recycle); + } + if (acb->write.next_dependent_write.le_prev) { + QLIST_REMOVE(acb, write.next_dependent_write); + } + free_write_resource(acb); /* Remaining cleanup is delegated; its body is only partly visible in this patch. */ +} + static void free_write_resource(FvdAIOCB * acb) { if (acb->write.next_write_lock.le_prev) { diff --git a/block/fvd.c b/block/fvd.c index 2402a94..c779d65 100644 --- a/block/fvd.c +++ b/block/fvd.c @@ -23,6 +23,16 @@ #include "block/fvd.h" +#define ENABLE_TRACE_IO +//#define DEBUG_MEMORY_LEAK + +#ifndef FVD_DEBUG +#undef DEBUG_MEMORY_LEAK +#endif +#ifndef ENABLE_QDEBUG +#undef ENABLE_TRACE_IO +#endif + /* Use include to avoid exposing too many FVD symbols, and to allow inline * function optimization. */ #include "block/fvd-debug.c" @@ -41,6 +51,11 @@ #include "block/fvd-prefetch.c" #include "block/fvd-update.c" +static AIOPool fvd_aio_pool = { /* AIO pool for FvdAIOCB objects; fvd_aio_cancel is registered as its cancel callback. */ + .aiocb_size = sizeof(FvdAIOCB), + .cancel = fvd_aio_cancel, +}; + static BlockDriver bdrv_fvd = { .format_name = "fvd", .instance_size = sizeof(BDRVFvdState), @@ -62,6 +77,8 @@ static BlockDriver bdrv_fvd = { static void bdrv_fvd_init(void) { + /* Random numbers are used in fvd-prefetch.c, so seed rand() before registering the driver. */ + srand(time(NULL) + getpid() + getpid() * 987654 + rand()); bdrv_register(&bdrv_fvd); } @@ -84,3 +101,11 @@ extern QTAILQ_HEAD(, BlockDriverState) bdrv_states; } } } + +/* + * TODOs: + * - Cap the prefetch throughput at the upper limit. See Section 3.4.2 of + * the FVD-cow paper. Related metadata are + * FvdHeader.prefetch_max_read_throughput and + * FvdHeader.prefetch_max_write_throughput. + */ -- 1.7.0.4