On Tue, May 29, 2012 at 11:50 PM, Stefan Hajnoczi <stefa...@gmail.com> wrote: >> + image_sectors = image_bs->total_sectors; > > Please use bdrv_getlength() instead of accessing total_sectors directly. > >> + image_drv = bdrv_find_format("raw"); >> + ret = bdrv_open(s->image_hd, image_filename, flags, image_drv); >> + if (ret < 0) { >> + bdrv_delete(s->image_hd); >> + goto fail; >> + } >> + bs->total_sectors = s->image_hd->total_sectors; > > bdrv_getlength() Okay. > >> +static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs, >> + int64_t sector_num, int nb_sectors, int *num_same) >> +{ >> + int changed; >> + int64_t cluster_num; >> + >> + if (nb_sectors == 0) { >> + *num_same = 0; >> + return 0; >> + } >> + >> + cluster_num = sector_num / SECTORS_PER_CLUSTER; >> + changed = is_allocated(bs, sector_num); > > Do we need to hold the lock before (indirectly) accessing the cache? > >> + *num_same = >> + MIN(nb_sectors, (cluster_num + 1) * SECTORS_PER_CLUSTER - >> sector_num); >> + >> + for (cluster_num = sector_num / SECTORS_PER_CLUSTER + 1; >> + cluster_num <= (sector_num + nb_sectors - 1) / SECTORS_PER_CLUSTER; >> + cluster_num++) { >> + if (is_allocated(bs, cluster_num * SECTORS_PER_CLUSTER) != changed) >> { >> + break; >> + } >> + *num_same = MIN(nb_sectors, >> + (cluster_num + 1) * SECTORS_PER_CLUSTER - sector_num); >> + } > > I think this makes sense but it would be easier to use a loop counter > that is in sector units instead of clusters. Then you can calculate > *num_name after the loop simply by subtracting the starting sector_num > from the final cur_sector value. And it saves you from constantly > converting back and forth between clusters and sectors. 
> >> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs, >> + int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov) >> +{ >> + BDRVAddCowState *s = bs->opaque; >> + int ret = 0, i; >> + QEMUIOVector hd_qiov; >> + uint8_t *table; >> + bool changed = false; >> + >> + qemu_co_mutex_lock(&s->lock); >> + qemu_iovec_init(&hd_qiov, qiov->niov); >> + ret = bdrv_co_writev(s->image_hd, >> + sector_num, >> + remaining_sectors, qiov); > > We don't need to lock across all writes. > > lock() > if write requires allocation: > ...do allocation stuff under lock... > unlock() > write data > > Internal data structure (cache, metadata, and copying unmodified > sectors) access needs to be locked during allocating writes. The > guest's data can be written without the lock held. > > This means that most writes will only lock for a short time to check > that the clusters are already allocated. Then they will be able to > write in parallel. > > If any cluster is not yet allocated then the allocation needs to > happen under lock. This ensures that we don't copy backing file > sectors while processing another write request that touches those > sectors. > >> + >> + if (ret < 0) { >> + goto fail; >> + } >> + /* copy content of unmodified sectors */ >> + if (!is_cluster_head(sector_num) && !is_allocated(bs, sector_num)) { >> + qemu_co_mutex_unlock(&s->lock); >> + ret = copy_sectors(bs, sector_num & ~(SECTORS_PER_CLUSTER - 1), >> + sector_num); > > As mentioned above, I think we need to lock during copy_sectors() so > that other requests cannot race with this. (The guest's writes must > always take priority over copying unmodified sectors.) 
> >> + qemu_co_mutex_lock(&s->lock); >> + if (ret < 0) { >> + goto fail; >> + } >> + } >> + >> + if (!is_cluster_tail(sector_num + remaining_sectors - 1) >> + && !is_allocated(bs, sector_num + remaining_sectors - 1)) { >> + qemu_co_mutex_unlock(&s->lock); >> + ret = copy_sectors(bs, sector_num + remaining_sectors, >> + ((sector_num + remaining_sectors) | (SECTORS_PER_CLUSTER - 1)) >> + 1); >> + qemu_co_mutex_lock(&s->lock); >> + if (ret < 0) { >> + goto fail; >> + } >> + } >> + >> + for (i = sector_num / SECTORS_PER_CLUSTER; >> + i <= (sector_num + remaining_sectors - 1) / SECTORS_PER_CLUSTER; >> + i++) { >> + ret = add_cow_cache_get(bs, s->bitmap_cache, >> + i * SECTORS_PER_CLUSTER, (void **)&table); >> + if (ret < 0) { >> + return 0; > > return ret? Yes.. > >> + } >> + if ((table[i / 8] & (1 << (i % 8))) == 0) { >> + table[i / 8] |= (1 << (i % 8)); >> + changed = true; >> + add_cow_cache_entry_mark_dirty(s->bitmap_cache, table); >> + } >> + >> + } >> + ret = 0; >> +fail: >> + if (changed) { >> + ret = add_cow_cache_flush(bs, s->bitmap_cache); >> + } >> + qemu_co_mutex_unlock(&s->lock); >> + qemu_iovec_destroy(&hd_qiov); >> + return ret; >> +} >> + >> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t size) >> +{ >> + BDRVAddCowState *s = bs->opaque; >> + int sector_per_byte = SECTORS_PER_CLUSTER * 8; >> + int ret; >> + int64_t old_image_sector = s->image_hd->total_sectors; >> + int64_t bitmap_size = >> + (size / BDRV_SECTOR_SIZE + sector_per_byte - 1) / sector_per_byte; >> + >> + ret = bdrv_truncate(bs->file, >> + sizeof(AddCowHeader) + bitmap_size); >> + if (ret < 0) { >> + bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE); > > Why truncate image_hd on failure? We never touch the image_hd size on > success either. I think we can just leave it alone.
That means that whether truncating add-cow fails or not, we should never touch the image_hd size? > >> +typedef struct AddCowHeader { >> + uint64_t magic; >> + uint32_t version; >> + char backing_file[ADD_COW_FILE_LEN]; >> + char image_file[ADD_COW_FILE_LEN]; >> + char reserved[500]; >> +} QEMU_PACKED AddCowHeader; > > I'd be tempted to start the bitmap at 4096 bytes into the file. On 4 > KB block size disks that would be a nice alignment and it doesn't > waste much additional space (the disk images we're trying to manage > are many GB :)). Okay. > >> @@ -828,6 +832,41 @@ static int img_convert(int argc, char **argv) >> } >> >> /* Create the new image */ >> + >> + if (0 == strcmp(out_fmt, "add-cow")) { >> + image_drv = bdrv_find_format("raw"); >> + if (!drv) { >> + ret = -1; >> + goto out; >> + } >> + snprintf(image_filename, sizeof(image_filename), >> + "%s"".ct.raw", out_filename); >> + ret = bdrv_create(image_drv, image_filename, image_param); >> + if (ret < 0) { >> + error_report("%s: error while creating image_file: %s", >> + image_filename, strerror(-ret)); >> + goto out; >> + } >> + set_option_parameter(param, BLOCK_OPT_IMAGE_FILE, image_filename); >> + >> + if (!out_baseimg) { >> + backing_drv = bdrv_find_format("qcow2"); >> + if (!drv) { >> + ret = -1; >> + goto out; >> + } >> + snprintf(backing_filename, sizeof(backing_filename), >> + "%s"".ct.qcow2", out_filename); >> + ret = bdrv_create(backing_drv, backing_filename, image_param); >> + if (ret < 0) { >> + error_report("%s: error while creating backing_file: %s", >> + backing_filename, strerror(-ret)); >> + goto out; >> + } >> + set_option_parameter(param, BLOCK_OPT_BACKING_FILE, >> + backing_filename); >> + } >> + } > > If this diff hunk is dropped then the user needs to manually create > the raw file before running qemu-img convert? > > qemu-img convert -O add-cow seems like a very rare case. I'm not sure > we should add special user-friend hacks for this.
> > I'm not sure I understand why you create a qcow2 file either. > Yes, if we use "qemu-img convert -O add-cow", we need to create two other files, a raw file and a qcow2 file (I just picked qcow2; other formats are also okay), as the image_file and backing_file; without these two files, the .add-cow file cannot work properly. Although it will occur only in very rare cases, I wish to pass all qemu-iotests cases, so I added this code. Do you think it is unnecessary? Also, some qemu-iotests cases use the "convert" operation; if I do not write the previous code, these cases will fail. Can I make these cases not support add-cow? Really, thanks for your review, Stefan. > Stefan >