On Thu, Apr 18, 2013 at 6:03 PM, Stefan Hajnoczi <stefa...@gmail.com> wrote: > On Wed, Apr 10, 2013 at 04:11:52PM +0800, Dong Xu Wang wrote: >> + header.cluster_bits = ffs(cluster_size) - 1; >> + if (header.cluster_bits < MIN_CLUSTER_BITS || >> + header.cluster_bits > MAX_CLUSTER_BITS || >> + (1 << header.cluster_bits) != cluster_size) { >> + error_report( >> + "Cluster size must be a power of two between %d and %dk", >> + 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); >> + return -EINVAL; >> + } >> + >> + header.header_size = MAX(cluster_size, DEFAULT_HEADER_SIZE); > > Indentation. > >> + if (backing_filename) { >> + header.backing_offset = sizeof(header); >> + header.backing_size = strlen(backing_filename); >> + >> + if (!backing_fmt) { >> + backing_bs = bdrv_new("image"); >> + ret = bdrv_open(backing_bs, backing_filename, NULL, >> + BDRV_O_RDWR | BDRV_O_CACHE_WB, NULL); >> + if (ret < 0) { >> + return ret; > > backing_bs is leaked. > >> + ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); >> + if (ret < 0) { >> + return ret; >> + } >> + snprintf(header.backing_fmt, sizeof(header.backing_fmt), "%s", >> + backing_fmt ? backing_fmt : ""); >> + snprintf(header.image_fmt, sizeof(header.image_fmt), "%s", >> + image_format ? image_format : "raw"); > > snprintf() doesn't have the semantics in the add-cow specification: > > " 44 - 59: backing file format > Format of backing file. It will be filled with > 0 if backing file name offset is 0. If backing > file name offset is non-empty, it must be > non-empty. It is coded in free-form ASCII, and > is not NUL-terminated. Zero padded on the right. > > 60 - 75: image file format > Format of image file. It must be non-empty. It > is coded in free-form ASCII, and is not > NUL-terminated. Zero padded on the right." > > strncpy() does the zero padding and doesn't NUL-terminate if the max buffer > size is used. > >> + if ((s->header.compat_features & ACOW_F_ALL_ALLOCATED) == 0) { >> + snprintf(bs->backing_format, sizeof(bs->backing_format), >> + "%s", s->header.backing_fmt); > > s->header.backing_fmt is not NUL-terminated so using snprintf() is > inappropriate (could it read beyond the end of .backing_fmt?). > >> + } >> + >> + if (s->header.cluster_bits < MIN_CLUSTER_BITS || >> + s->header.cluster_bits > MAX_CLUSTER_BITS) { >> + ret = -EINVAL; >> + goto fail; >> + } >> + >> + s->cluster_size = 1 << s->header.cluster_bits; >> + if (s->header.header_size != MAX(s->cluster_size, DEFAULT_HEADER_SIZE)) >> { >> + char buf[64]; >> + snprintf(buf, sizeof(buf), "Header size: %d", > > %u or PRIu32 since header_size is uint32_t. This avoids compiler or > code scanner warnings. > >> + s->image_hd = bdrv_new(""); >> + ret = bdrv_open(s->image_hd, image_filename, NULL, flags, >> + bdrv_find_format(s->header.image_fmt)); > > Cannot use image_fmt as a string since it is not NUL-terminated. > >> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs, >> + int64_t sector_num, >> + int remaining_sectors, >> + QEMUIOVector *qiov) >> +{ >> + BDRVAddCowState *s = bs->opaque; >> + int ret = 0, i; >> + QEMUIOVector hd_qiov; >> + uint8_t *table; >> + uint64_t offset; >> + int mask = s->cluster_sectors - 1; >> + int cluster_mask = s->cluster_size - 1; >> + >> + qemu_co_mutex_lock(&s->lock); >> + qemu_iovec_init(&hd_qiov, qiov->niov); >> + ret = bdrv_co_writev(s->image_hd, sector_num, >> + remaining_sectors, qiov); > > All writes are serialized. This means write performance will be very > poor for multi-threaded workloads. > > qcow2 tracks allocating writes and allows them to execute at the same > time if they do not overlap clusters. > >> + >> + if (ret < 0) { >> + goto fail; >> + } >> + if ((s->header.compat_features & ACOW_F_ALL_ALLOCATED) == 0) { >> + /* Copy content of unmodified sectors */ >> + if (!is_cluster_head(sector_num, s->cluster_sectors) >> + && !is_allocated(bs, sector_num)) { >> + ret = copy_sectors(bs, sector_num & ~mask, sector_num); >> + if (ret < 0) { >> + goto fail; >> + } >> + } >> + >> + if (!is_cluster_tail(sector_num + remaining_sectors - 1, >> + s->cluster_sectors) >> + && !is_allocated(bs, sector_num + remaining_sectors - 1)) { >> + ret = copy_sectors(bs, sector_num + remaining_sectors, >> + ((sector_num + remaining_sectors) | mask) + >> 1); >> + if (ret < 0) { >> + goto fail; >> + } >> + } > > This trashes the cluster when remaining_sectors = 0, sector_num = > cluster_sectors, and sector cluster_sectors - 1 is unallocated. > > Probably best to return early when remaining_sectors == 0. > >> + >> + for (i = sector_num / s->cluster_sectors; >> + i <= (sector_num + remaining_sectors - 1) / s->cluster_sectors; >> + i++) { >> + offset = s->header.header_size >> + + (offset_in_bitmap(i * s->cluster_sectors, >> + s->cluster_sectors) & (~cluster_mask)); >> + ret = block_cache_get(bs, s->bitmap_cache, offset, (void >> **)&table); >> + if (ret < 0) { >> + goto fail; >> + } >> + if ((table[i / 8] & (1 << (i % 8))) == 0) { >> + table[i / 8] |= (1 << (i % 8)); > > i is based on sector_num while table[] starts at offset, not sector 0. > The index expression i / 8 leads to out-of-bounds accesses. > > I think you forgot to & (s->cluster_sectors - 1). > >> +static coroutine_fn int add_cow_co_flush(BlockDriverState *bs) >> +{ >> + BDRVAddCowState *s = bs->opaque; >> + int ret; >> + >> + qemu_co_mutex_lock(&s->lock); >> + if (s->bitmap_cache) { >> + ret = block_cache_flush(bs, s->bitmap_cache); >> + if (ret < 0) { >> + return ret; >> + } >> + } >> + ret = bdrv_flush(s->image_hd); > > This is the wrong way around. We must flush image_hd first so that > valid data is on disk. Then we can flush bitmap_cache to mark the > clusters allocated. > > Beyond explicit flushes you also need to make sure that image_hd is > flushed *before* bitmap_cache tables are written out (e.g. cache > eviction when the cache becomes full). It seems this code is missing. > Hi Stefan, how can I make sure image_hd "flush" operation completed before I write bitmap_cache tables? Is there any functions I can use?
Thank you. > Also please use bdrv_co_flush() instead of bdrv_flush() in > add_cow_co_flush() since this is a coroutine function. > >> diff --git a/block/block-cache.c b/block/block-cache.c >> index 3544691..4824632 100644 >> --- a/block/block-cache.c >> +++ b/block/block-cache.c >> @@ -130,7 +130,7 @@ static int block_cache_entry_flush(BlockDriverState *bs, >> BlockCache *c, int i) >> } else if (c->table_type == BLOCK_TABLE_L2) { >> BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); >> } else if (c->table_type == BLOCK_TABLE_BITMAP) { >> - BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); >> + BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); >> } >> >> ret = bdrv_pwrite(bs->file, c->entries[i].offset, >> @@ -265,7 +265,7 @@ static int block_cache_do_get(BlockDriverState *bs, >> BlockCache *c, >> if (c->table_type == BLOCK_TABLE_L2) { >> BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); >> } else if (c->table_type == BLOCK_TABLE_BITMAP) { >> - BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); >> + BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); >> } >> >> ret = bdrv_pread(bs->file, offset, c->entries[i].table, > > I commented on this in the previous patch. Please squash this fix into > the previous patch. > >> diff --git a/block/qcow2.h b/block/qcow2.h >> index e7f6aec..a4e514b 100644 >> --- a/block/qcow2.h >> +++ b/block/qcow2.h >> @@ -98,6 +98,9 @@ typedef struct QCowSnapshot { >> uint64_t vm_clock_nsec; >> } QCowSnapshot; >> >> +struct BlockCache; >> +typedef struct BlockCache BlockCache; >> + >> typedef struct Qcow2UnknownHeaderExtension { >> uint32_t magic; >> uint32_t len; >> @@ -389,7 +392,4 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, const >> char *snapshot_name); >> void qcow2_free_snapshots(BlockDriverState *bs); >> int qcow2_read_snapshots(BlockDriverState *bs); >> >> -/* qcow2-cache.c functions */ >> - >> - > > More qcow2-cache.c move cleanups? Please squash into the previous > patch. >