On Tue, May 29, 2012 at 11:50 PM, Stefan Hajnoczi <stefa...@gmail.com> wrote: >> + image_sectors = image_bs->total_sectors; > > Please use bdrv_getlength() instead of accessing total_sectors directly. > >> + image_drv = bdrv_find_format("raw"); >> + ret = bdrv_open(s->image_hd, image_filename, flags, image_drv); >> + if (ret < 0) { >> + bdrv_delete(s->image_hd); >> + goto fail; >> + } >> + bs->total_sectors = s->image_hd->total_sectors; > > bdrv_getlength() Okay. > >> +static coroutine_fn int add_cow_is_allocated(BlockDriverState *bs, >> + int64_t sector_num, int nb_sectors, int *num_same) >> +{ >> + int changed; >> + int64_t cluster_num; >> + >> + if (nb_sectors == 0) { >> + *num_same = 0; >> + return 0; >> + } >> + >> + cluster_num = sector_num / SECTORS_PER_CLUSTER; >> + changed = is_allocated(bs, sector_num); > > Do we need to hold the lock before (indirectly) accessing the cache? > >> + *num_same = >> + MIN(nb_sectors, (cluster_num + 1) * SECTORS_PER_CLUSTER - >> sector_num); >> + >> + for (cluster_num = sector_num / SECTORS_PER_CLUSTER + 1; >> + cluster_num <= (sector_num + nb_sectors - 1) / SECTORS_PER_CLUSTER; >> + cluster_num++) { >> + if (is_allocated(bs, cluster_num * SECTORS_PER_CLUSTER) != changed) >> { >> + break; >> + } >> + *num_same = MIN(nb_sectors, >> + (cluster_num + 1) * SECTORS_PER_CLUSTER - sector_num); >> + } > > I think this makes sense but it would be easier to use a loop counter > that is in sector units instead of clusters. Then you can calculate > *num_name after the loop simply by subtracting the starting sector_num > from the final cur_sector value. And it saves you from constantly > converting back and forth between clusters and sectors. 
> >> +static coroutine_fn int add_cow_co_writev(BlockDriverState *bs, >> + int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov) >> +{ >> + BDRVAddCowState *s = bs->opaque; >> + int ret = 0, i; >> + QEMUIOVector hd_qiov; >> + uint8_t *table; >> + bool changed = false; >> + >> + qemu_co_mutex_lock(&s->lock); >> + qemu_iovec_init(&hd_qiov, qiov->niov); >> + ret = bdrv_co_writev(s->image_hd, >> + sector_num, >> + remaining_sectors, qiov); > > We don't need to lock across all writes. > > lock() > if write requires allocation: > ...do allocation stuff under lock... > unlock() > write data > > Internal data structure (cache, metadata, and copying unmodified > sectors) access needs to be locked during allocating writes. The > guest's data can be written without the lock held. > > This means that most writes will only lock for a short time to check > that the clusters are already allocated. Then they will be able to > write in parallel. > > If any cluster is not yet allocated then the allocation needs to > happen under lock. This ensures that we don't copy backing file > sectors while processing another write request that touches those > sectors. > >> + >> + if (ret < 0) { >> + goto fail; >> + } >> + /* copy content of unmodified sectors */ >> + if (!is_cluster_head(sector_num) && !is_allocated(bs, sector_num)) { >> + qemu_co_mutex_unlock(&s->lock); >> + ret = copy_sectors(bs, sector_num & ~(SECTORS_PER_CLUSTER - 1), >> + sector_num); > > As mentioned above, I think we need to lock during copy_sectors() so > that other requests cannot race with this. (The guest's writes must > always take priority over copying unmodified sectors.) 
> >> + qemu_co_mutex_lock(&s->lock); >> + if (ret < 0) { >> + goto fail; >> + } >> + } >> + >> + if (!is_cluster_tail(sector_num + remaining_sectors - 1) >> + && !is_allocated(bs, sector_num + remaining_sectors - 1)) { >> + qemu_co_mutex_unlock(&s->lock); >> + ret = copy_sectors(bs, sector_num + remaining_sectors, >> + ((sector_num + remaining_sectors) | (SECTORS_PER_CLUSTER - 1)) >> + 1); >> + qemu_co_mutex_lock(&s->lock); >> + if (ret < 0) { >> + goto fail; >> + } >> + } >> + >> + for (i = sector_num / SECTORS_PER_CLUSTER; >> + i <= (sector_num + remaining_sectors - 1) / SECTORS_PER_CLUSTER; >> + i++) { >> + ret = add_cow_cache_get(bs, s->bitmap_cache, >> + i * SECTORS_PER_CLUSTER, (void **)&table); >> + if (ret < 0) { >> + return 0; > > return ret? Yes.. > >> + } >> + if ((table[i / 8] & (1 << (i % 8))) == 0) { >> + table[i / 8] |= (1 << (i % 8)); >> + changed = true; >> + add_cow_cache_entry_mark_dirty(s->bitmap_cache, table); >> + } >> + >> + } >> + ret = 0; >> +fail: >> + if (changed) { >> + ret = add_cow_cache_flush(bs, s->bitmap_cache); >> + } >> + qemu_co_mutex_unlock(&s->lock); >> + qemu_iovec_destroy(&hd_qiov); >> + return ret; >> +} >> + >> +static int bdrv_add_cow_truncate(BlockDriverState *bs, int64_t size) >> +{ >> + BDRVAddCowState *s = bs->opaque; >> + int sector_per_byte = SECTORS_PER_CLUSTER * 8; >> + int ret; >> + int64_t old_image_sector = s->image_hd->total_sectors; >> + int64_t bitmap_size = >> + (size / BDRV_SECTOR_SIZE + sector_per_byte - 1) / sector_per_byte; >> + >> + ret = bdrv_truncate(bs->file, >> + sizeof(AddCowHeader) + bitmap_size); >> + if (ret < 0) { >> + bdrv_truncate(s->image_hd, old_image_sector * BDRV_SECTOR_SIZE); > > Why truncate image_hd on failure? We never touch the image_hd size on > success either. I think we can just leave it alone.
That means that whether truncating add-cow fails or not, we should never touch the image_hd size? > >> +typedef struct AddCowHeader { >> + uint64_t magic; >> + uint32_t version; >> + char backing_file[ADD_COW_FILE_LEN]; >> + char image_file[ADD_COW_FILE_LEN]; >> + char reserved[500]; >> +} QEMU_PACKED AddCowHeader; > > I'd be tempted to start the bitmap at 4096 bytes into the file. On 4 > KB block size disks that would be a nice alignment and it doesn't > waste much additional space (the disk images we're trying to manage > are many GB :)). Okay. > >> @@ -828,6 +832,41 @@ static int img_convert(int argc, char **argv) >> } >> >> /* Create the new image */ >> + >> + if (0 == strcmp(out_fmt, "add-cow")) { >> + image_drv = bdrv_find_format("raw"); >> + if (!drv) { >> + ret = -1; >> + goto out; >> + } >> + snprintf(image_filename, sizeof(image_filename), >> + "%s"".ct.raw", out_filename); >> + ret = bdrv_create(image_drv, image_filename, image_param); >> + if (ret < 0) { >> + error_report("%s: error while creating image_file: %s", >> + image_filename, strerror(-ret)); >> + goto out; >> + } >> + set_option_parameter(param, BLOCK_OPT_IMAGE_FILE, image_filename); >> + >> + if (!out_baseimg) { >> + backing_drv = bdrv_find_format("qcow2"); >> + if (!drv) { >> + ret = -1; >> + goto out; >> + } >> + snprintf(backing_filename, sizeof(backing_filename), >> + "%s"".ct.qcow2", out_filename); >> + ret = bdrv_create(backing_drv, backing_filename, image_param); >> + if (ret < 0) { >> + error_report("%s: error while creating backing_file: %s", >> + backing_filename, strerror(-ret)); >> + goto out; >> + } >> + set_option_parameter(param, BLOCK_OPT_BACKING_FILE, >> + backing_filename); >> + } >> + } > > If this diff hunk is dropped then the user needs to manually create > the raw file before running qemu-img convert? > > qemu-img convert -O add-cow seems like a very rare case. I'm not sure > we should add special user-friend hacks for this.
> > I'm not sure I understand why you create a qcow2 file either. > Yes, if we use "qemu-img convert -O add-cow", we need to create two other files, a raw file and a qcow2 file (I just picked qcow2; other formats are also okay), as the image_file and backing_file; without these two files, the .add-cow file cannot work properly. Although it will occur only in very rare cases, I wish to pass all qemu-iotests cases, so I added this code. Do you think it is unnecessary? Also, some qemu-iotests cases use the "convert" operation; if I do not write the previous code, these cases will fail. Can I make these cases not support add-cow? Really, thanks for your review, Stefan. > Stefan >